Nina Odoux
NOTEBOOK 2¶
Este cuaderno se enfocará en la división de los datos (data splitting) y en el análisis exploratorio (EDA) del dataset¶
- VERIFICAR BALANCE
- SPLIT TRAIN/TEST
- CATEGORICAL AND CONTINUOUS
- UNIVARIABLE ANALISIS
- CONCLUSION UNIVARIABLE
- For each type CATEGORICAL AND CONTINUOUS analysis of:
- OUTLIERS
- MISSING VALUES
- CORRELATION
#importacion de librerias :
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import time
import gc
import psutil
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
#importar funciones de mi archivo ipynb:
import sys
sys.path.append("../src")
import eda_utils as eda
# Seed for reproducible random operations (passed as random_state to the train/test split).
seed = 123
# Raise pandas' display limits so wide and long frames are rendered without truncation.
pd.options.display.max_columns = 123
pd.options.display.max_rows = 5000
Lectura de datos del preprocesado inicial¶
# Load the CSV written by the initial preprocessing step.
preprocessed_csv = "../data/preprocessed_data/01_preprocessed_data.csv"
data = pd.read_csv(preprocessed_csv)
# Bare expression so the notebook renders a preview of the frame.
data
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | 
FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | CASH LOANS | MALE | NO | YES | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | UNACCOMPANIED | WORKING | SECONDARY / SECONDARY SPECIAL | SINGLE / NOT MARRIED | HOUSE / APARTMENT | 0.018801 | -9461 | -637 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | LABORERS | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | BUSINESS ENTITY TYPE 3 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.0220 | 0.0198 | 0.0 | 0.0000 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.0000 | REG OPER ACCOUNT | BLOCK OF FLATS | 0.0149 | STONE, BRICK | NO | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 100003 | 0 | CASH LOANS | FEMALE | NO | NO | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | FAMILY | STATE SERVANT | HIGHER EDUCATION | MARRIED | HOUSE / APARTMENT | 0.003541 | -16765 | -1188 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | CORE STAFF | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | SCHOOL | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.0790 | 0.0554 | 0.0 | 0.0000 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.0100 | REG OPER ACCOUNT | BLOCK OF FLATS | 0.0714 | BLOCK | NO | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100004 | 0 | REVOLVING LOANS | MALE | YES | YES | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | UNACCOMPANIED | WORKING | SECONDARY / SECONDARY SPECIAL | SINGLE / NOT MARRIED | HOUSE / APARTMENT | 0.010032 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | LABORERS | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | GOVERNMENT | NaN | 0.555912 | 0.729567 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100006 | 0 | CASH LOANS | FEMALE | NO | YES | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | UNACCOMPANIED | WORKING | SECONDARY / SECONDARY SPECIAL | CIVIL MARRIAGE | HOUSE / APARTMENT | 0.008019 | -19005 | -3039 | -9833.0 | -2437 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | LABORERS | 2.0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | BUSINESS ENTITY TYPE 3 | NaN | 0.650442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100007 | 0 | CASH LOANS | MALE | NO | YES | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | UNACCOMPANIED | WORKING | SECONDARY / SECONDARY SPECIAL | SINGLE / NOT MARRIED | HOUSE / APARTMENT | 0.028663 | -19932 | -3038 | -4311.0 | -3458 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | CORE STAFF | 1.0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | RELIGION | NaN | 0.322738 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 307506 | 456251 | 0 | CASH LOANS | MALE | NO | NO | 0 | 157500.0 | 254700.0 | 27558.0 | 225000.0 | UNACCOMPANIED | WORKING | SECONDARY / SECONDARY SPECIAL | SEPARATED | WITH PARENTS | 0.032561 | -9327 | -236 | -8456.0 | -1982 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | SALES STAFF | 1.0 | 1 | 1 | THURSDAY | 15 | 0 | 0 | 0 | 0 | 0 | 0 | SERVICES | 0.145570 | 0.681632 | NaN | 0.2021 | 0.0887 | 0.9876 | 0.8300 | 0.0202 | 0.22 | 0.1034 | 0.6042 | 0.2708 | 0.0594 | 0.1484 | 0.1965 | 0.0753 | 0.1095 | 0.1008 | 0.0172 | 0.9782 | 0.7125 | 0.0172 | 0.0806 | 0.0345 | 0.4583 | 0.0417 | 0.0094 | 0.0882 | 0.0853 | 0.0 | 0.0125 | 0.2040 | 0.0887 | 0.9876 | 0.8323 | 0.0203 | 0.22 | 0.1034 | 0.6042 | 0.2708 | 0.0605 | 0.1509 | 0.2001 | 0.0757 | 0.1118 | REG OPER ACCOUNT | BLOCK OF FLATS | 0.2898 | STONE, BRICK | NO | -273.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307507 | 456252 | 0 | CASH LOANS | FEMALE | NO | YES | 0 | 72000.0 | 269550.0 | 12001.5 | 225000.0 | UNACCOMPANIED | PENSIONER | SECONDARY / SECONDARY SPECIAL | WIDOW | HOUSE / APARTMENT | 0.025164 | -20775 | 365243 | -4388.0 | -4090 | NaN | 1 | 0 | 0 | 1 | 1 | 0 | NaN | 1.0 | 2 | 2 | MONDAY | 8 | 0 | 0 | 0 | 0 | 0 | 0 | XNA | NaN | 0.115992 | NaN | 0.0247 | 0.0435 | 0.9727 | 0.6260 | 0.0022 | 0.00 | 0.1034 | 0.0833 | 0.1250 | 0.0579 | 0.0202 | 0.0257 | 0.0000 | 0.0000 | 0.0252 | 0.0451 | 0.9727 | 0.6406 | 0.0022 | 0.0000 | 0.1034 | 0.0833 | 0.1250 | 0.0592 | 0.0220 | 0.0267 | 0.0 | 0.0000 | 0.0250 | 0.0435 | 0.9727 | 0.6310 | 0.0022 | 0.00 | 0.1034 | 0.0833 | 0.1250 | 0.0589 | 0.0205 | 0.0261 | 0.0000 | 0.0000 | REG OPER ACCOUNT | BLOCK OF FLATS | 0.0214 | STONE, BRICK | NO | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307508 | 456253 | 0 | CASH LOANS | FEMALE | NO | YES | 0 | 153000.0 | 677664.0 | 29979.0 | 585000.0 | UNACCOMPANIED | WORKING | HIGHER EDUCATION | SEPARATED | HOUSE / APARTMENT | 0.005002 | -14966 | -7921 | -6737.0 | -5150 | NaN | 1 | 1 | 0 | 1 | 0 | 1 | MANAGERS | 1.0 | 3 | 3 | THURSDAY | 9 | 0 | 0 | 0 | 0 | 1 | 1 | SCHOOL | 0.744026 | 0.535722 | 0.218859 | 0.1031 | 0.0862 | 0.9816 | 0.7484 | 0.0123 | 0.00 | 0.2069 | 0.1667 | 0.2083 | NaN | 0.0841 | 0.9279 | 0.0000 | 0.0000 | 0.1050 | 0.0894 | 0.9816 | 0.7583 | 0.0124 | 0.0000 | 0.2069 | 0.1667 | 0.2083 | NaN | 0.0918 | 0.9667 | 0.0 | 0.0000 | 0.1041 | 0.0862 | 0.9816 | 0.7518 | 0.0124 | 0.00 | 0.2069 | 0.1667 | 0.2083 | NaN | 0.0855 | 0.9445 | 0.0000 | 0.0000 | REG OPER ACCOUNT | BLOCK OF FLATS | 0.7970 | PANEL | NO | -1909.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 307509 | 456254 | 1 | CASH LOANS | FEMALE | NO | YES | 0 | 171000.0 | 370107.0 | 20205.0 | 319500.0 | UNACCOMPANIED | COMMERCIAL ASSOCIATE | SECONDARY / SECONDARY SPECIAL | MARRIED | HOUSE / APARTMENT | 0.005313 | -11961 | -4786 | -2562.0 | -931 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | LABORERS | 2.0 | 2 | 2 | WEDNESDAY | 9 | 0 | 0 | 0 | 1 | 1 | 0 | BUSINESS ENTITY TYPE 1 | NaN | 0.514163 | 0.661024 | 0.0124 | NaN | 0.9771 | NaN | NaN | NaN | 0.0690 | 0.0417 | NaN | NaN | NaN | 0.0061 | NaN | NaN | 0.0126 | NaN | 0.9772 | NaN | NaN | NaN | 0.0690 | 0.0417 | NaN | NaN | NaN | 0.0063 | NaN | NaN | 0.0125 | NaN | 0.9771 | NaN | NaN | NaN | 0.0690 | 0.0417 | NaN | NaN | NaN | 0.0062 | NaN | NaN | NaN | BLOCK OF FLATS | 0.0086 | STONE, BRICK | NO | -322.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 307510 | 456255 | 0 | CASH LOANS | FEMALE | NO | NO | 0 | 157500.0 | 675000.0 | 49117.5 | 675000.0 | UNACCOMPANIED | COMMERCIAL ASSOCIATE | HIGHER EDUCATION | MARRIED | HOUSE / APARTMENT | 0.046220 | -16856 | -1262 | -5128.0 | -410 | NaN | 1 | 1 | 1 | 1 | 1 | 0 | LABORERS | 2.0 | 1 | 1 | THURSDAY | 20 | 0 | 0 | 0 | 0 | 1 | 1 | BUSINESS ENTITY TYPE 3 | 0.734460 | 0.708569 | 0.113922 | 0.0742 | 0.0526 | 0.9881 | NaN | 0.0176 | 0.08 | 0.0690 | 0.3750 | NaN | NaN | NaN | 0.0791 | NaN | 0.0000 | 0.0756 | 0.0546 | 0.9881 | NaN | 0.0178 | 0.0806 | 0.0690 | 0.3750 | NaN | NaN | NaN | 0.0824 | NaN | 0.0000 | 0.0749 | 0.0526 | 0.9881 | NaN | 0.0177 | 0.08 | 0.0690 | 0.3750 | NaN | NaN | NaN | 0.0805 | NaN | 0.0000 | NaN | BLOCK OF FLATS | 0.0718 | PANEL | NO | -787.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 |
307511 rows × 118 columns
VERIFICAR BALANCE¶
Los datos no parecen seguir un orden específico que impacte la repartición durante el proceso de división. Sin embargo, se observa un desequilibrio en las proporciones de las clases de TARGET que debe ser tratado. Para abordar el desequilibrio de clases, podríamos considerar métodos como el re-sampling o el uso de pesos de clase durante el entrenamiento del modelo; además, un muestreo estratificado al dividir los datos asegura que cada conjunto conserve la misma proporción de clases que el dataset original.
STRATIFICATION¶
Asegura que la distribución de clases de la variable objetivo se mantenga tanto en el TRAIN como en el TEST. Esto es importante cuando se trabajan con clases desequilibradas, ya que previene que el conjunto de TEST tenga una distribución desigual de las clases en comparación con el conjunto de datos original.
SEPARACION TRAIN Y TEST¶
Separamos los datos en TRAIN y TEST porque esto permite evaluar qué tan bien el modelo generalizará a datos nuevos que no ha visto previamente.
# TARGET is the label to predict (y); every other column is a feature (X).
y = data['TARGET']
X = data.drop(columns='TARGET')
# Stratified 80/20 train/test split: stratifying on y keeps the TARGET class
# proportions the same in both partitions; random_state makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
    stratify=y,
)
# Re-attach the label to each partition (TARGET ends up as the last column).
data_train = pd.concat([X_train, y_train], axis="columns")
data_test = pd.concat([X_test, y_test], axis="columns")
# Check the class proportions of TARGET in the original data and in each partition.
for header, labels in (
    ("Original class distribution:", y),
    ("\nTrain set class distribution:", y_train),
    ("\nTest set class distribution:", y_test),
):
    print(header)
    # normalize=True reports relative frequencies instead of raw counts.
    print(labels.value_counts(normalize=True))
Original class distribution: 0 0.919271 1 0.080729 Name: TARGET, dtype: float64 Train set class distribution: 0 0.919271 1 0.080729 Name: TARGET, dtype: float64 Test set class distribution: 0 0.919272 1 0.080728 Name: TARGET, dtype: float64
La estratificación permitió que la distribución de las clases conserve el mismo patrón que el dataset de origen¶
- Alrededor del 92% para la clase 0 y del 8% para la clase 1
Visualización descriptiva de los datos¶
- valores nulos por filas
- por columnas
Por columnas (variable)¶
# Percentage of missing values per column, computed on the training set.
missing_col = eda.check_missing_values(data_train)
# Order the columns from most to least missing, then reset the index so the
# column names become a regular column of the resulting DataFrame.
missing_col_sorted = missing_col.sort_values(ascending=False).reset_index()
print(missing_col_sorted)
% of NAN per column:
CODE_GENDER 0.001219
AMT_ANNUITY 0.004471
AMT_GOODS_PRICE 0.085770
NAME_TYPE_SUITE 0.407710
OWN_CAR_AGE 65.985253
OCCUPATION_TYPE 31.276219
CNT_FAM_MEMBERS 0.000813
EXT_SOURCE_1 56.310364
EXT_SOURCE_2 0.209749
EXT_SOURCE_3 19.907483
APARTMENTS_AVG 50.715424
BASEMENTAREA_AVG 58.485903
YEARS_BEGINEXPLUATATION_AVG 48.752480
YEARS_BUILD_AVG 66.479139
COMMONAREA_AVG 69.847322
ELEVATORS_AVG 53.250707
ENTRANCES_AVG 50.298771
FLOORSMAX_AVG 49.723993
FLOORSMIN_AVG 67.831534
LANDAREA_AVG 59.359452
LIVINGAPARTMENTS_AVG 68.351842
LIVINGAREA_AVG 50.143085
NONLIVINGAPARTMENTS_AVG 69.427823
NONLIVINGAREA_AVG 55.127069
APARTMENTS_MODE 50.715424
BASEMENTAREA_MODE 58.485903
YEARS_BEGINEXPLUATATION_MODE 48.752480
YEARS_BUILD_MODE 66.479139
COMMONAREA_MODE 69.847322
ELEVATORS_MODE 53.250707
ENTRANCES_MODE 50.298771
FLOORSMAX_MODE 49.723993
FLOORSMIN_MODE 67.831534
LANDAREA_MODE 59.359452
LIVINGAPARTMENTS_MODE 68.351842
LIVINGAREA_MODE 50.143085
NONLIVINGAPARTMENTS_MODE 69.427823
NONLIVINGAREA_MODE 55.127069
APARTMENTS_MEDI 50.715424
BASEMENTAREA_MEDI 58.485903
YEARS_BEGINEXPLUATATION_MEDI 48.752480
YEARS_BUILD_MEDI 66.479139
COMMONAREA_MEDI 69.847322
ELEVATORS_MEDI 53.250707
ENTRANCES_MEDI 50.298771
FLOORSMAX_MEDI 49.723993
FLOORSMIN_MEDI 67.831534
LANDAREA_MEDI 59.359452
LIVINGAPARTMENTS_MEDI 68.351842
LIVINGAREA_MEDI 50.143085
NONLIVINGAPARTMENTS_MEDI 69.427823
NONLIVINGAREA_MEDI 55.127069
FONDKAPREMONT_MODE 68.368102
HOUSETYPE_MODE 50.147556
TOTALAREA_MODE 48.227293
WALLSMATERIAL_MODE 50.813388
EMERGENCYSTATE_MODE 47.361468
AMT_REQ_CREDIT_BUREAU_HOUR 13.573136
AMT_REQ_CREDIT_BUREAU_DAY 13.573136
AMT_REQ_CREDIT_BUREAU_WEEK 13.573136
AMT_REQ_CREDIT_BUREAU_MON 13.573136
AMT_REQ_CREDIT_BUREAU_QRT 13.573136
AMT_REQ_CREDIT_BUREAU_YEAR 13.573136
dtype: float64
index 0
0 COMMONAREA_MODE 69.847322
1 COMMONAREA_MEDI 69.847322
2 COMMONAREA_AVG 69.847322
3 NONLIVINGAPARTMENTS_MODE 69.427823
4 NONLIVINGAPARTMENTS_AVG 69.427823
5 NONLIVINGAPARTMENTS_MEDI 69.427823
6 FONDKAPREMONT_MODE 68.368102
7 LIVINGAPARTMENTS_AVG 68.351842
8 LIVINGAPARTMENTS_MODE 68.351842
9 LIVINGAPARTMENTS_MEDI 68.351842
10 FLOORSMIN_MEDI 67.831534
11 FLOORSMIN_MODE 67.831534
12 FLOORSMIN_AVG 67.831534
13 YEARS_BUILD_MODE 66.479139
14 YEARS_BUILD_MEDI 66.479139
15 YEARS_BUILD_AVG 66.479139
16 OWN_CAR_AGE 65.985253
17 LANDAREA_MEDI 59.359452
18 LANDAREA_MODE 59.359452
19 LANDAREA_AVG 59.359452
20 BASEMENTAREA_MEDI 58.485903
21 BASEMENTAREA_AVG 58.485903
22 BASEMENTAREA_MODE 58.485903
23 EXT_SOURCE_1 56.310364
24 NONLIVINGAREA_MODE 55.127069
25 NONLIVINGAREA_AVG 55.127069
26 NONLIVINGAREA_MEDI 55.127069
27 ELEVATORS_AVG 53.250707
28 ELEVATORS_MEDI 53.250707
29 ELEVATORS_MODE 53.250707
30 WALLSMATERIAL_MODE 50.813388
31 APARTMENTS_AVG 50.715424
32 APARTMENTS_MODE 50.715424
33 APARTMENTS_MEDI 50.715424
34 ENTRANCES_MODE 50.298771
35 ENTRANCES_MEDI 50.298771
36 ENTRANCES_AVG 50.298771
37 HOUSETYPE_MODE 50.147556
38 LIVINGAREA_MEDI 50.143085
39 LIVINGAREA_MODE 50.143085
40 LIVINGAREA_AVG 50.143085
41 FLOORSMAX_MEDI 49.723993
42 FLOORSMAX_MODE 49.723993
43 FLOORSMAX_AVG 49.723993
44 YEARS_BEGINEXPLUATATION_AVG 48.752480
45 YEARS_BEGINEXPLUATATION_MEDI 48.752480
46 YEARS_BEGINEXPLUATATION_MODE 48.752480
47 TOTALAREA_MODE 48.227293
48 EMERGENCYSTATE_MODE 47.361468
49 OCCUPATION_TYPE 31.276219
50 EXT_SOURCE_3 19.907483
51 AMT_REQ_CREDIT_BUREAU_DAY 13.573136
52 AMT_REQ_CREDIT_BUREAU_WEEK 13.573136
53 AMT_REQ_CREDIT_BUREAU_MON 13.573136
54 AMT_REQ_CREDIT_BUREAU_QRT 13.573136
55 AMT_REQ_CREDIT_BUREAU_HOUR 13.573136
56 AMT_REQ_CREDIT_BUREAU_YEAR 13.573136
57 NAME_TYPE_SUITE 0.407710
58 EXT_SOURCE_2 0.209749
59 AMT_GOODS_PRICE 0.085770
60 AMT_ANNUITY 0.004471
61 CODE_GENDER 0.001219
62 CNT_FAM_MEMBERS 0.000813
# Plot missing values for the training partition only: the per-column and
# per-row statistics are computed on data_train, and exploring the full
# dataset here would bring the held-out test rows into the analysis.
eda.plot_missing_values(data_train)
# Missing values per row, together with the TARGET label of each row.
missing_row_sorted = eda.check_missing_per_row(data_train)
# Bare expression so the notebook displays the resulting dataframe.
missing_row_sorted
| missing_percentage | TARGET | |
|---|---|---|
| 236260 | 50.000000 | 0 |
| 249616 | 50.000000 | 1 |
| 224619 | 49.152542 | 0 |
| 267335 | 49.152542 | 0 |
| 26398 | 49.152542 | 0 |
| ... | ... | ... |
| 33699 | 0.000000 | 0 |
| 239409 | 0.000000 | 0 |
| 221021 | 0.000000 | 0 |
| 167597 | 0.000000 | 0 |
| 183048 | 0.000000 | 0 |
246008 rows × 2 columns
VISUALIZATION OF CONTINUOUS AND CATEGORICAL VARIABLES¶
# Split the training columns with the helper from the project's eda module.
# Per the author's notes: category/object columns and numeric columns with
# fewer than 20 unique values go to the categorical list; float/int columns
# with more than 20 unique values go to the continuous list.
categorical_vars, continuous_vars = eda.dame_variables_categoricas(data_train)
print(
    f"Variables categóricas: {categorical_vars}",
    "----------------------------------------------------------------",
    f"Variables continuas: {continuous_vars}",
    sep="\n",
)
Variables categóricas: ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_QRT', 'TARGET'] ---------------------------------------------------------------- Variables continuas: ['SK_ID_CURR', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'HOUR_APPR_PROCESS_START', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 
'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_YEAR']
# Make sure the folder dedicated to the EDA images exists.
images_dir = '../images/02_notebook_images'
os.makedirs(images_dir, exist_ok=True)
# Plot every categorical variable with respect to the TARGET column.
eda.plot_all_variables(
    data_train,
    categorical_vars,
    'TARGET',
)
Analyzing NAME_CONTRACT_TYPE
Analyzing CODE_GENDER
Analyzing FLAG_OWN_CAR
Analyzing FLAG_OWN_REALTY
Analyzing CNT_CHILDREN
Analyzing NAME_TYPE_SUITE
Analyzing NAME_INCOME_TYPE
Analyzing NAME_EDUCATION_TYPE
Analyzing NAME_FAMILY_STATUS
Analyzing NAME_HOUSING_TYPE
Analyzing FLAG_MOBIL
Analyzing FLAG_EMP_PHONE
Analyzing FLAG_WORK_PHONE
Analyzing FLAG_CONT_MOBILE
Analyzing FLAG_PHONE
Analyzing FLAG_EMAIL
Analyzing OCCUPATION_TYPE
Analyzing CNT_FAM_MEMBERS
Analyzing REGION_RATING_CLIENT
Analyzing REGION_RATING_CLIENT_W_CITY
Analyzing WEEKDAY_APPR_PROCESS_START
Analyzing REG_REGION_NOT_LIVE_REGION
Analyzing REG_REGION_NOT_WORK_REGION
Analyzing LIVE_REGION_NOT_WORK_REGION
Analyzing REG_CITY_NOT_LIVE_CITY
Analyzing REG_CITY_NOT_WORK_CITY
Analyzing LIVE_CITY_NOT_WORK_CITY
Analyzing ORGANIZATION_TYPE
Analyzing FONDKAPREMONT_MODE
Analyzing HOUSETYPE_MODE
Analyzing WALLSMATERIAL_MODE
Analyzing EMERGENCYSTATE_MODE
Analyzing FLAG_DOCUMENT_2
Analyzing FLAG_DOCUMENT_3
Analyzing FLAG_DOCUMENT_4
Analyzing FLAG_DOCUMENT_5
Analyzing FLAG_DOCUMENT_6
Analyzing FLAG_DOCUMENT_7
Analyzing FLAG_DOCUMENT_8
Analyzing FLAG_DOCUMENT_9
Analyzing FLAG_DOCUMENT_10
Analyzing FLAG_DOCUMENT_11
Analyzing FLAG_DOCUMENT_12
Analyzing FLAG_DOCUMENT_13
Analyzing FLAG_DOCUMENT_14
Analyzing FLAG_DOCUMENT_15
Analyzing FLAG_DOCUMENT_16
Analyzing FLAG_DOCUMENT_17
Analyzing FLAG_DOCUMENT_18
Analyzing FLAG_DOCUMENT_19
Analyzing FLAG_DOCUMENT_20
Analyzing FLAG_DOCUMENT_21
Analyzing AMT_REQ_CREDIT_BUREAU_HOUR
Analyzing AMT_REQ_CREDIT_BUREAU_DAY
Analyzing AMT_REQ_CREDIT_BUREAU_WEEK
Analyzing AMT_REQ_CREDIT_BUREAU_QRT
Analyzing TARGET
# Plot every continuous variable against TARGET; per the author's note the
# helper also saves the resulting images.
eda.plot_all_features(
    data_train,
    continuous_vars,
    target_col='TARGET',
)
Iniciando análisis de 61 características continuas Procesando lote 1 de 21
Gráfico de 'SK_ID_CURR' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'AMT_INCOME_TOTAL' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'AMT_CREDIT' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 2 de 21
Gráfico de 'AMT_ANNUITY' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'AMT_GOODS_PRICE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'REGION_POPULATION_RELATIVE' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 3 de 21
Gráfico de 'DAYS_BIRTH' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'DAYS_EMPLOYED' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'DAYS_REGISTRATION' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 4 de 21
Gráfico de 'DAYS_ID_PUBLISH' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'OWN_CAR_AGE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'HOUR_APPR_PROCESS_START' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 5 de 21
Gráfico de 'EXT_SOURCE_1' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'EXT_SOURCE_2' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'EXT_SOURCE_3' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 6 de 21
Gráfico de 'APARTMENTS_AVG' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'BASEMENTAREA_AVG' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'YEARS_BEGINEXPLUATATION_AVG' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 7 de 21
Gráfico de 'YEARS_BUILD_AVG' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'COMMONAREA_AVG' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'ELEVATORS_AVG' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 8 de 21
Gráfico de 'ENTRANCES_AVG' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'FLOORSMAX_AVG' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'FLOORSMIN_AVG' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 9 de 21
Gráfico de 'LANDAREA_AVG' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'LIVINGAPARTMENTS_AVG' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'LIVINGAREA_AVG' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 10 de 21
Gráfico de 'NONLIVINGAPARTMENTS_AVG' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'NONLIVINGAREA_AVG' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'APARTMENTS_MODE' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 11 de 21
Gráfico de 'BASEMENTAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'YEARS_BEGINEXPLUATATION_MODE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'YEARS_BUILD_MODE' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 12 de 21
Gráfico de 'COMMONAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'ELEVATORS_MODE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'ENTRANCES_MODE' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 13 de 21
Gráfico de 'FLOORSMAX_MODE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'FLOORSMIN_MODE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'LANDAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 14 de 21
Gráfico de 'LIVINGAPARTMENTS_MODE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'LIVINGAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'NONLIVINGAPARTMENTS_MODE' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 15 de 21
Gráfico de 'NONLIVINGAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'APARTMENTS_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'BASEMENTAREA_MEDI' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 16 de 21
Gráfico de 'YEARS_BEGINEXPLUATATION_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'YEARS_BUILD_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'COMMONAREA_MEDI' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 17 de 21
Gráfico de 'ELEVATORS_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'ENTRANCES_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'FLOORSMAX_MEDI' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 18 de 21
Gráfico de 'FLOORSMIN_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'LANDAREA_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'LIVINGAPARTMENTS_MEDI' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 19 de 21
Gráfico de 'LIVINGAREA_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'NONLIVINGAPARTMENTS_MEDI' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'NONLIVINGAREA_MEDI' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 20 de 21
Gráfico de 'TOTALAREA_MODE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'DAYS_LAST_PHONE_CHANGE' guardado exitosamente en '..\images\02_notebook_images'.
Gráfico de 'AMT_REQ_CREDIT_BUREAU_MON' guardado exitosamente en '..\images\02_notebook_images'. Procesando lote 21 de 21
Gráfico de 'AMT_REQ_CREDIT_BUREAU_YEAR' guardado exitosamente en '..\images\02_notebook_images'. Proceso completado exitosamente.
Conclusión¶
Para concluir:
Vamos a destacar las tendencias observadas en la distribución de las variables en relación con TARGET para identificar aquellas que, a simple vista, parecen tener mayor influencia en la clasificación.
Factores laborales¶
- ORGANIZATION_TYPE muestra que ciertos tipos de trabajos, como limpieza, electricidad y cultura, están más asociados con la clase 1, lo que sugiere una relación entre el tipo de ámbito laboral y la propensión a retrasos.
- OCCUPATION_TYPE refuerza esta tendencia, destacando que los trabajos no calificados, como camareros, conductores, cocineros y personal de seguridad, tienen una mayor proporción en la clase 1.
Situación socioeconómica¶
- En NAME_INCOME_TYPE, los desempleados y quienes están en permiso por maternidad tienen una mayor probabilidad de retraso (clase 1), reflejando factores socioeconómicos que notamos en la vida real.
- Los propietarios de una vivienda y un coche tienden a pertenecer más a la clase 0, aunque esta diferencia no es significativa.
Factores demográficos¶
- En cuanto al género, los hombres están más representados en la clase 1, mientras que las mujeres predominan en la clase 0.
- La educación parece ser crucial: un mayor nivel educativo está asociado con la clase 0, mientras que niveles bajos se relacionan con la clase 1.
- La edad también influye: los clientes más mayores tienden a estar en la clase 0, mientras que los más jóvenes son más propensos a retrasos (clase 1).
Características del préstamo¶
- Los préstamos revolving están mayormente asociados con la clase 0, mientras que los préstamos cash predominan en la clase 1. Esto podría deberse a que los préstamos revolving reflejan una mejor gestión financiera.
- Un crédito más alto parece estar vinculado a la clase 0, posiblemente debido a filtros más estrictos para aprobar montos elevados.
- Aunque se esperaría que las annuities más altas se relacionen con la clase 1, los datos muestran que están más presentes en la clase 0, con algunos outliers en la clase 1.
Factores familiares¶
- El número de niños impacta significativamente: a medida que aumenta, también lo hace la proporción de clientes en la clase 1, lo que significa una mayor probabilidad de retraso.
- Sin embargo, no se observan patrones claros relacionados con el estado civil, según si el cliente que pidió el préstamo estaba casado o no.
Factores externos¶
- Las clasificaciones externas (no tenemos insights sobre cómo se hizo la clasificación), como el rating de región y el rating de cliente, indican que las regiones y los clientes de tipo 3 tienen mayor probabilidad de pertenecer a la clase 1, lo que sugiere que cuanto más alto es el rating, mayor es el riesgo.
Conclusion para la variable INCOME¶
- El ingreso del cliente es una variable que considero imprescindible.
Se observa una tendencia central más alta en la clase 0, que presenta una mediana de ingresos superior. La dispersión es prácticamente igual, dado que el rango intercuartil es bastante similar en ambas clases. Aunque hay una ligera diferencia de valores extremos en la clase 0 (lo que significa que la gente con más ingresos tiene más probabilidad de ser de clase 0), no es lo suficientemente significativa como para concluir que las personas con retraso tienen una diferencia considerable en los ingresos en comparación con otros casos.
A continuación, se tratan los valores missing, las correlaciones de las variables continuas y los outliers¶
# Display the list of continuous variable names (notebook cell output).
continuous_vars
['SK_ID_CURR', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'HOUR_APPR_PROCESS_START', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_YEAR']
OUTLIERS¶
# Identify extreme values for the continuous variables in the training split.
# Per the original author's note, the helper flags values outside a range
# defined by `multiplier` standard deviations (here 3) and reports them by
# TARGET class -- NOTE(review): confirm the exact rule in eda_utils.
eda.get_deviation_of_mean_perc(data_train, continuous_vars, target='TARGET', multiplier=3)
| 0.0 | 1.0 | variable | sum_outlier_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | 0.938095 | 0.061905 | AMT_INCOME_TOTAL | 210 | 0.000854 |
| 1 | 0.958988 | 0.041012 | AMT_CREDIT | 2609 | 0.010605 |
| 2 | 0.960951 | 0.039049 | AMT_ANNUITY | 2356 | 0.009577 |
| 3 | 0.959916 | 0.040084 | AMT_GOODS_PRICE | 3318 | 0.013487 |
| 4 | 0.959733 | 0.040267 | REGION_POPULATION_RELATIVE | 6730 | 0.027357 |
| 5 | 0.958403 | 0.041597 | DAYS_REGISTRATION | 601 | 0.002443 |
| 6 | 0.916479 | 0.083521 | OWN_CAR_AGE | 2670 | 0.010853 |
| 7 | 0.897638 | 0.102362 | HOUR_APPR_PROCESS_START | 508 | 0.002065 |
| 8 | 0.950358 | 0.049642 | APARTMENTS_AVG | 2377 | 0.009662 |
| 9 | 0.949077 | 0.050923 | BASEMENTAREA_AVG | 1571 | 0.006386 |
| 10 | 0.917431 | 0.082569 | YEARS_BEGINEXPLUATATION_AVG | 545 | 0.002215 |
| 11 | 0.921466 | 0.078534 | YEARS_BUILD_AVG | 955 | 0.003882 |
| 12 | 0.944767 | 0.055233 | COMMONAREA_AVG | 1376 | 0.005593 |
| 13 | 0.953775 | 0.046225 | ELEVATORS_AVG | 1947 | 0.007914 |
| 14 | 0.938833 | 0.061167 | ENTRANCES_AVG | 1782 | 0.007244 |
| 15 | 0.957355 | 0.042645 | FLOORSMAX_AVG | 2087 | 0.008483 |
| 16 | 0.970588 | 0.029412 | FLOORSMIN_AVG | 476 | 0.001935 |
| 17 | 0.935252 | 0.064748 | LANDAREA_AVG | 1668 | 0.006780 |
| 18 | 0.948313 | 0.051687 | LIVINGAPARTMENTS_AVG | 1393 | 0.005662 |
| 19 | 0.947904 | 0.052096 | LIVINGAREA_AVG | 2553 | 0.010378 |
| 20 | 0.932743 | 0.067257 | NONLIVINGAPARTMENTS_AVG | 565 | 0.002297 |
| 21 | 0.948454 | 0.051546 | NONLIVINGAREA_AVG | 1940 | 0.007886 |
| 22 | 0.948505 | 0.051495 | APARTMENTS_MODE | 2408 | 0.009788 |
| 23 | 0.944745 | 0.055255 | BASEMENTAREA_MODE | 1665 | 0.006768 |
| 24 | 0.915414 | 0.084586 | YEARS_BEGINEXPLUATATION_MODE | 532 | 0.002163 |
| 25 | 0.922441 | 0.077559 | YEARS_BUILD_MODE | 967 | 0.003931 |
| 26 | 0.939839 | 0.060161 | COMMONAREA_MODE | 1363 | 0.005540 |
| 27 | 0.949329 | 0.050671 | ELEVATORS_MODE | 2684 | 0.010910 |
| 28 | 0.938244 | 0.061756 | ENTRANCES_MODE | 1765 | 0.007175 |
| 29 | 0.958768 | 0.041232 | FLOORSMAX_MODE | 2110 | 0.008577 |
| 30 | 0.971429 | 0.028571 | FLOORSMIN_MODE | 385 | 0.001565 |
| 31 | 0.935972 | 0.064028 | LANDAREA_MODE | 1718 | 0.006984 |
| 32 | 0.944095 | 0.055905 | LIVINGAPARTMENTS_MODE | 1431 | 0.005817 |
| 33 | 0.945673 | 0.054327 | LIVINGAREA_MODE | 2669 | 0.010849 |
| 34 | 0.926554 | 0.073446 | NONLIVINGAPARTMENTS_MODE | 531 | 0.002158 |
| 35 | 0.949058 | 0.050942 | NONLIVINGAREA_MODE | 1963 | 0.007979 |
| 36 | 0.949979 | 0.050021 | APARTMENTS_MEDI | 2419 | 0.009833 |
| 37 | 0.947937 | 0.052063 | BASEMENTAREA_MEDI | 1575 | 0.006402 |
| 38 | 0.913386 | 0.086614 | YEARS_BEGINEXPLUATATION_MEDI | 508 | 0.002065 |
| 39 | 0.922280 | 0.077720 | YEARS_BUILD_MEDI | 965 | 0.003923 |
| 40 | 0.943844 | 0.056156 | COMMONAREA_MEDI | 1389 | 0.005646 |
| 41 | 0.953981 | 0.046019 | ELEVATORS_MEDI | 1934 | 0.007862 |
| 42 | 0.938582 | 0.061418 | ENTRANCES_MEDI | 1791 | 0.007280 |
| 43 | 0.957515 | 0.042485 | FLOORSMAX_MEDI | 2189 | 0.008898 |
| 44 | 0.970852 | 0.029148 | FLOORSMIN_MEDI | 446 | 0.001813 |
| 45 | 0.938596 | 0.061404 | LANDAREA_MEDI | 1710 | 0.006951 |
| 46 | 0.947820 | 0.052180 | LIVINGAPARTMENTS_MEDI | 1399 | 0.005687 |
| 47 | 0.948968 | 0.051032 | LIVINGAREA_MEDI | 2567 | 0.010435 |
| 48 | 0.930728 | 0.069272 | NONLIVINGAPARTMENTS_MEDI | 563 | 0.002289 |
| 49 | 0.948692 | 0.051308 | NONLIVINGAREA_MEDI | 1949 | 0.007923 |
| 50 | 0.956044 | 0.043956 | TOTALAREA_MODE | 2639 | 0.010727 |
| 51 | 0.965974 | 0.034026 | DAYS_LAST_PHONE_CHANGE | 529 | 0.002150 |
| 52 | 0.948478 | 0.051522 | AMT_REQ_CREDIT_BUREAU_MON | 2562 | 0.010414 |
| 53 | 0.911004 | 0.088996 | AMT_REQ_CREDIT_BUREAU_YEAR | 2708 | 0.011008 |
El método de la desviación estándar es un método clásico que he utilizado y que parece adecuado para la distribución de mis datos. Como me parece confiable, decidí no explorar métodos alternativos para obtener una mayor precisión.
Para concluir el analisis de valores extremos continuos:¶
Una proporción tan baja de valores atípicos globalmente significa que su impacto global en la distribución probablemente sea limitado.
Además, en variables importantes como el ingreso (income), el número de valores atípicos es extremadamente bajo.
No es indispensable tratarlos de momento, pero quedo atenta a la elección del modelo para ver si se debe ajustar la precisión y si resulta importante adaptarlos en caso de que el modelo lo requiera.
Se podrían utilizar métodos como el capping, la imputación con la mediana o la media, las transformaciones matemáticas (logaritmo, raíz cuadrada), o eliminarlos...
CORRELATION CATEGORICAL AND NUMERICAL SEPARATELY¶
- MATRIZ Y COEF. DE CORRELACION
# Pearson correlation matrix of the continuous variables in the training split.
# NOTE(review): presumably the helper draws the matrix as a figure of size
# `size_figure` -- confirm against eda_utils.
eda.get_corr_matrix(dataset = data_train[continuous_vars],
metodo='pearson', size_figure=[10,8])
0
# Pairwise Pearson correlations between the continuous variables (training split).
corr = data_train[continuous_vars].corr(method='pearson')

# Absolute correlations with only the strictly-lower triangle kept; the diagonal
# and the upper triangle are zeroed so each pair carries its value exactly once.
new_corr = pd.DataFrame(
    np.tril(corr.abs(), k=-1),
    index=corr.index,
    columns=corr.columns,
)

# Long format (one row per variable pair), strongest correlations first.
new_corr = (
    new_corr.stack()
    .to_frame('correlation')
    .reset_index()
    .sort_values(by='correlation', ascending=False)
)

# Show only the pairs whose absolute correlation exceeds 0.55.
new_corr[new_corr['correlation'] > 0.55]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 2824 | YEARS_BUILD_MEDI | YEARS_BUILD_AVG | 0.998419 |
| 3134 | FLOORSMIN_MEDI | FLOORSMIN_AVG | 0.997352 |
| 3072 | FLOORSMAX_MEDI | FLOORSMAX_AVG | 0.997141 |
| 3010 | ENTRANCES_MEDI | ENTRANCES_AVG | 0.996948 |
| 2948 | ELEVATORS_MEDI | ELEVATORS_AVG | 0.996008 |
| 2886 | COMMONAREA_MEDI | COMMONAREA_AVG | 0.995816 |
| 3320 | LIVINGAREA_MEDI | LIVINGAREA_AVG | 0.995450 |
| 2638 | APARTMENTS_MEDI | APARTMENTS_AVG | 0.995270 |
| 3258 | LIVINGAPARTMENTS_MEDI | LIVINGAPARTMENTS_AVG | 0.994466 |
| 2700 | BASEMENTAREA_MEDI | BASEMENTAREA_AVG | 0.994035 |
| 2762 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BEGINEXPLUATATION_AVG | 0.993125 |
| 3444 | NONLIVINGAREA_MEDI | NONLIVINGAREA_AVG | 0.991649 |
| 3196 | LANDAREA_MEDI | LANDAREA_AVG | 0.991599 |
| 3382 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAPARTMENTS_AVG | 0.990498 |
| 1970 | YEARS_BUILD_MODE | YEARS_BUILD_AVG | 0.989127 |
| 2838 | YEARS_BUILD_MEDI | YEARS_BUILD_MODE | 0.989118 |
| 3148 | FLOORSMIN_MEDI | FLOORSMIN_MODE | 0.988433 |
| 3086 | FLOORSMAX_MEDI | FLOORSMAX_MODE | 0.988204 |
| 246 | AMT_GOODS_PRICE | AMT_CREDIT | 0.986997 |
| 2280 | FLOORSMIN_MODE | FLOORSMIN_AVG | 0.986046 |
| 2218 | FLOORSMAX_MODE | FLOORSMAX_AVG | 0.985710 |
| 2962 | ELEVATORS_MEDI | ELEVATORS_MODE | 0.982707 |
| 3210 | LANDAREA_MEDI | LANDAREA_MODE | 0.980788 |
| 3024 | ENTRANCES_MEDI | ENTRANCES_MODE | 0.980273 |
| 2900 | COMMONAREA_MEDI | COMMONAREA_MODE | 0.979032 |
| 2094 | ELEVATORS_MODE | ELEVATORS_AVG | 0.978604 |
| 2156 | ENTRANCES_MODE | ENTRANCES_AVG | 0.977388 |
| 2652 | APARTMENTS_MEDI | APARTMENTS_MODE | 0.977085 |
| 3396 | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAPARTMENTS_MODE | 0.977053 |
| 2714 | BASEMENTAREA_MEDI | BASEMENTAREA_MODE | 0.976871 |
| 2032 | COMMONAREA_MODE | COMMONAREA_AVG | 0.976245 |
| 3272 | LIVINGAPARTMENTS_MEDI | LIVINGAPARTMENTS_MODE | 0.975746 |
| 3458 | NONLIVINGAREA_MEDI | NONLIVINGAREA_MODE | 0.975426 |
| 3334 | LIVINGAREA_MEDI | LIVINGAREA_MODE | 0.974965 |
| 1784 | APARTMENTS_MODE | APARTMENTS_AVG | 0.973294 |
| 2342 | LANDAREA_MODE | LANDAREA_AVG | 0.973156 |
| 2466 | LIVINGAREA_MODE | LIVINGAREA_AVG | 0.972183 |
| 1846 | BASEMENTAREA_MODE | BASEMENTAREA_AVG | 0.972080 |
| 2404 | LIVINGAPARTMENTS_MODE | LIVINGAPARTMENTS_AVG | 0.970693 |
| 1908 | YEARS_BEGINEXPLUATATION_MODE | YEARS_BEGINEXPLUATATION_AVG | 0.970327 |
| 2528 | NONLIVINGAPARTMENTS_MODE | NONLIVINGAPARTMENTS_AVG | 0.967621 |
| 2590 | NONLIVINGAREA_MODE | NONLIVINGAREA_AVG | 0.967063 |
| 2776 | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BEGINEXPLUATATION_MODE | 0.960662 |
| 1540 | LIVINGAPARTMENTS_AVG | APARTMENTS_AVG | 0.944185 |
| 3248 | LIVINGAPARTMENTS_MEDI | APARTMENTS_AVG | 0.942998 |
| 3276 | LIVINGAPARTMENTS_MEDI | APARTMENTS_MEDI | 0.942776 |
| 2408 | LIVINGAPARTMENTS_MODE | APARTMENTS_MODE | 0.939285 |
| 2648 | APARTMENTS_MEDI | LIVINGAPARTMENTS_AVG | 0.936461 |
| 2662 | APARTMENTS_MEDI | LIVINGAPARTMENTS_MODE | 0.933201 |
| 2394 | LIVINGAPARTMENTS_MODE | APARTMENTS_AVG | 0.931936 |
| 3503 | TOTALAREA_MODE | LIVINGAREA_AVG | 0.925681 |
| 3531 | TOTALAREA_MODE | LIVINGAREA_MEDI | 0.920420 |
| 3337 | LIVINGAREA_MEDI | APARTMENTS_MEDI | 0.917647 |
| 3262 | LIVINGAPARTMENTS_MEDI | APARTMENTS_MODE | 0.915566 |
| 1601 | LIVINGAREA_AVG | APARTMENTS_AVG | 0.915396 |
| 3309 | LIVINGAREA_MEDI | APARTMENTS_AVG | 0.914270 |
| 2649 | APARTMENTS_MEDI | LIVINGAREA_AVG | 0.914240 |
| 2469 | LIVINGAREA_MODE | APARTMENTS_MODE | 0.912045 |
| 1794 | APARTMENTS_MODE | LIVINGAPARTMENTS_AVG | 0.909813 |
| 3517 | TOTALAREA_MODE | LIVINGAREA_MODE | 0.900182 |
| 2663 | APARTMENTS_MEDI | LIVINGAREA_MODE | 0.897528 |
| 3323 | LIVINGAREA_MEDI | APARTMENTS_MODE | 0.895802 |
| 2455 | LIVINGAREA_MODE | APARTMENTS_AVG | 0.895123 |
| 3492 | TOTALAREA_MODE | APARTMENTS_AVG | 0.893126 |
| 1795 | APARTMENTS_MODE | LIVINGAREA_AVG | 0.892349 |
| 3520 | TOTALAREA_MODE | APARTMENTS_MEDI | 0.887156 |
| 3347 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_MEDI | 0.884883 |
| 3259 | LIVINGAPARTMENTS_MEDI | LIVINGAREA_AVG | 0.883318 |
| 1611 | LIVINGAREA_AVG | LIVINGAPARTMENTS_AVG | 0.881072 |
| 2479 | LIVINGAREA_MODE | LIVINGAPARTMENTS_MODE | 0.879649 |
| 3319 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_AVG | 0.879001 |
| 3333 | LIVINGAREA_MEDI | LIVINGAPARTMENTS_MODE | 0.874785 |
| 2405 | LIVINGAPARTMENTS_MODE | LIVINGAREA_AVG | 0.873256 |
| 3342 | LIVINGAREA_MEDI | ELEVATORS_MEDI | 0.869387 |
| 1606 | LIVINGAREA_AVG | ELEVATORS_AVG | 0.868331 |
| 2954 | ELEVATORS_MEDI | LIVINGAREA_AVG | 0.866723 |
| 3314 | LIVINGAREA_MEDI | ELEVATORS_AVG | 0.866426 |
| 3506 | TOTALAREA_MODE | APARTMENTS_MODE | 0.864362 |
| 3273 | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MODE | 0.858390 |
| 2474 | LIVINGAREA_MODE | ELEVATORS_MODE | 0.856916 |
| 3328 | LIVINGAREA_MEDI | ELEVATORS_MODE | 0.856738 |
| 2100 | ELEVATORS_MODE | LIVINGAREA_AVG | 0.853372 |
| 2465 | LIVINGAREA_MODE | LIVINGAPARTMENTS_AVG | 0.853118 |
| 3502 | TOTALAREA_MODE | LIVINGAPARTMENTS_AVG | 0.849229 |
| 3530 | TOTALAREA_MODE | LIVINGAPARTMENTS_MEDI | 0.847481 |
| 3497 | TOTALAREA_MODE | ELEVATORS_AVG | 0.843365 |
| 2968 | ELEVATORS_MEDI | LIVINGAREA_MODE | 0.841639 |
| 2460 | LIVINGAREA_MODE | ELEVATORS_AVG | 0.839344 |
| 3525 | TOTALAREA_MODE | ELEVATORS_MEDI | 0.837275 |
| 2971 | ELEVATORS_MEDI | APARTMENTS_MEDI | 0.836541 |
| 1235 | ELEVATORS_AVG | APARTMENTS_AVG | 0.836042 |
| 3516 | TOTALAREA_MODE | LIVINGAPARTMENTS_MODE | 0.835947 |
| 2943 | ELEVATORS_MEDI | APARTMENTS_AVG | 0.834482 |
| 2643 | APARTMENTS_MEDI | ELEVATORS_AVG | 0.833536 |
| 2103 | ELEVATORS_MODE | APARTMENTS_MODE | 0.825541 |
| 2657 | APARTMENTS_MEDI | ELEVATORS_MODE | 0.824718 |
| 2089 | ELEVATORS_MODE | APARTMENTS_AVG | 0.821834 |
| 3511 | TOTALAREA_MODE | ELEVATORS_MODE | 0.820205 |
| 3281 | LIVINGAPARTMENTS_MEDI | ELEVATORS_MEDI | 0.812438 |
| 3253 | LIVINGAPARTMENTS_MEDI | ELEVATORS_AVG | 0.810733 |
| 1545 | LIVINGAPARTMENTS_AVG | ELEVATORS_AVG | 0.810014 |
| 2953 | ELEVATORS_MEDI | LIVINGAPARTMENTS_AVG | 0.807994 |
| 2957 | ELEVATORS_MEDI | APARTMENTS_MODE | 0.807919 |
| 2413 | LIVINGAPARTMENTS_MODE | ELEVATORS_MODE | 0.806637 |
| 1789 | APARTMENTS_MODE | ELEVATORS_AVG | 0.804933 |
| 3267 | LIVINGAPARTMENTS_MEDI | ELEVATORS_MODE | 0.798001 |
| 2967 | ELEVATORS_MEDI | LIVINGAPARTMENTS_MODE | 0.797162 |
| 2399 | LIVINGAPARTMENTS_MODE | ELEVATORS_AVG | 0.795324 |
| 2099 | ELEVATORS_MODE | LIVINGAPARTMENTS_AVG | 0.793141 |
| 247 | AMT_GOODS_PRICE | AMT_ANNUITY | 0.775701 |
| 185 | AMT_ANNUITY | AMT_CREDIT | 0.770691 |
| 1425 | FLOORSMIN_AVG | FLOORSMAX_AVG | 0.742447 |
| 3161 | FLOORSMIN_MEDI | FLOORSMAX_MEDI | 0.740578 |
| 3133 | FLOORSMIN_MEDI | FLOORSMAX_AVG | 0.740053 |
| 3073 | FLOORSMAX_MEDI | FLOORSMIN_AVG | 0.740023 |
| 3147 | FLOORSMIN_MEDI | FLOORSMAX_MODE | 0.730022 |
| 2219 | FLOORSMAX_MODE | FLOORSMIN_AVG | 0.729350 |
| 2293 | FLOORSMIN_MODE | FLOORSMAX_MODE | 0.727067 |
| 3087 | FLOORSMAX_MEDI | FLOORSMIN_MODE | 0.723708 |
| 2279 | FLOORSMIN_MODE | FLOORSMAX_AVG | 0.722966 |
| 1602 | LIVINGAREA_AVG | BASEMENTAREA_AVG | 0.695154 |
| 2710 | BASEMENTAREA_MEDI | LIVINGAREA_AVG | 0.695129 |
| 3338 | LIVINGAREA_MEDI | BASEMENTAREA_MEDI | 0.694073 |
| 2470 | LIVINGAREA_MODE | BASEMENTAREA_MODE | 0.693588 |
| 3310 | LIVINGAREA_MEDI | BASEMENTAREA_AVG | 0.691838 |
| 2724 | BASEMENTAREA_MEDI | LIVINGAREA_MODE | 0.683185 |
| 2727 | BASEMENTAREA_MEDI | APARTMENTS_MEDI | 0.682731 |
| 991 | BASEMENTAREA_AVG | APARTMENTS_AVG | 0.681712 |
| 2699 | BASEMENTAREA_MEDI | APARTMENTS_AVG | 0.681551 |
| 1859 | BASEMENTAREA_MODE | APARTMENTS_MODE | 0.681342 |
| 2639 | APARTMENTS_MEDI | BASEMENTAREA_AVG | 0.680320 |
| 1362 | FLOORSMAX_AVG | ELEVATORS_AVG | 0.680080 |
| 2456 | LIVINGAREA_MODE | BASEMENTAREA_AVG | 0.679499 |
| 3070 | FLOORSMAX_MEDI | ELEVATORS_AVG | 0.677748 |
| 1856 | BASEMENTAREA_MODE | LIVINGAREA_AVG | 0.676864 |
| 2950 | ELEVATORS_MEDI | FLOORSMAX_AVG | 0.676434 |
| 3324 | LIVINGAREA_MEDI | BASEMENTAREA_MODE | 0.676385 |
| 3098 | FLOORSMAX_MEDI | ELEVATORS_MEDI | 0.675868 |
| 3493 | TOTALAREA_MODE | BASEMENTAREA_AVG | 0.673406 |
| 2216 | FLOORSMAX_MODE | ELEVATORS_AVG | 0.670997 |
| 3521 | TOTALAREA_MODE | BASEMENTAREA_MEDI | 0.670901 |
| 2713 | BASEMENTAREA_MEDI | APARTMENTS_MODE | 0.670827 |
| 2964 | ELEVATORS_MEDI | FLOORSMAX_MODE | 0.669300 |
| 1785 | APARTMENTS_MODE | BASEMENTAREA_AVG | 0.667797 |
| 2653 | APARTMENTS_MEDI | BASEMENTAREA_MODE | 0.666497 |
| 1845 | BASEMENTAREA_MODE | APARTMENTS_AVG | 0.664205 |
| 2230 | FLOORSMAX_MODE | ELEVATORS_MODE | 0.661472 |
| 2409 | LIVINGAPARTMENTS_MODE | BASEMENTAREA_MODE | 0.657269 |
| 2165 | ENTRANCES_MODE | BASEMENTAREA_MODE | 0.656745 |
| 2096 | ELEVATORS_MODE | FLOORSMAX_AVG | 0.656743 |
| 3084 | FLOORSMAX_MEDI | ELEVATORS_MODE | 0.656085 |
| 1851 | BASEMENTAREA_MODE | ENTRANCES_AVG | 0.655926 |
| 2705 | BASEMENTAREA_MEDI | ENTRANCES_AVG | 0.655063 |
| 3019 | ENTRANCES_MEDI | BASEMENTAREA_MODE | 0.654844 |
| 2723 | BASEMENTAREA_MEDI | LIVINGAPARTMENTS_MODE | 0.653997 |
| 3277 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_MEDI | 0.653879 |
| 3033 | ENTRANCES_MEDI | BASEMENTAREA_MEDI | 0.653525 |
| 1297 | ENTRANCES_AVG | BASEMENTAREA_AVG | 0.652845 |
| 3249 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_AVG | 0.651098 |
| 3507 | TOTALAREA_MODE | BASEMENTAREA_MODE | 0.650905 |
| 2709 | BASEMENTAREA_MEDI | LIVINGAPARTMENTS_AVG | 0.649699 |
| 2395 | LIVINGAPARTMENTS_MODE | BASEMENTAREA_AVG | 0.649393 |
| 1541 | LIVINGAPARTMENTS_AVG | BASEMENTAREA_AVG | 0.649292 |
| 3005 | ENTRANCES_MEDI | BASEMENTAREA_AVG | 0.648754 |
| 2719 | BASEMENTAREA_MEDI | ENTRANCES_MODE | 0.633048 |
| 3263 | LIVINGAPARTMENTS_MEDI | BASEMENTAREA_MODE | 0.632473 |
| 3499 | TOTALAREA_MODE | FLOORSMAX_AVG | 0.632121 |
| 1608 | LIVINGAREA_AVG | FLOORSMAX_AVG | 0.631623 |
| 3076 | FLOORSMAX_MEDI | LIVINGAREA_AVG | 0.629591 |
| 3527 | TOTALAREA_MODE | FLOORSMAX_MEDI | 0.629423 |
| 3316 | LIVINGAREA_MEDI | FLOORSMAX_AVG | 0.628424 |
| 1855 | BASEMENTAREA_MODE | LIVINGAPARTMENTS_AVG | 0.628040 |
| 2151 | ENTRANCES_MODE | BASEMENTAREA_AVG | 0.627817 |
| 3344 | LIVINGAREA_MEDI | FLOORSMAX_MEDI | 0.627624 |
| 2222 | FLOORSMAX_MODE | LIVINGAREA_AVG | 0.627241 |
| 3330 | LIVINGAREA_MEDI | FLOORSMAX_MODE | 0.625625 |
| 3513 | TOTALAREA_MODE | FLOORSMAX_MODE | 0.624915 |
| 2475 | LIVINGAREA_MODE | ENTRANCES_MODE | 0.621397 |
| 2461 | LIVINGAREA_MODE | ENTRANCES_AVG | 0.621177 |
| 3029 | ENTRANCES_MEDI | LIVINGAREA_MODE | 0.620609 |
| 1357 | FLOORSMAX_AVG | APARTMENTS_AVG | 0.618742 |
| 3315 | LIVINGAREA_MEDI | ENTRANCES_AVG | 0.618092 |
| 3343 | LIVINGAREA_MEDI | ENTRANCES_MEDI | 0.617806 |
| 1607 | LIVINGAREA_AVG | ENTRANCES_AVG | 0.617606 |
| 3065 | FLOORSMAX_MEDI | APARTMENTS_AVG | 0.616645 |
| 433 | DAYS_EMPLOYED | DAYS_BIRTH | 0.615939 |
| 2645 | APARTMENTS_MEDI | FLOORSMAX_AVG | 0.615532 |
| 2164 | ENTRANCES_MODE | APARTMENTS_MODE | 0.614903 |
| 3093 | FLOORSMAX_MEDI | APARTMENTS_MEDI | 0.614634 |
| 2211 | FLOORSMAX_MODE | APARTMENTS_AVG | 0.614561 |
| 3015 | ENTRANCES_MEDI | LIVINGAREA_AVG | 0.613990 |
| 2659 | APARTMENTS_MEDI | FLOORSMAX_MODE | 0.612911 |
| 3018 | ENTRANCES_MEDI | APARTMENTS_MODE | 0.611534 |
| 1790 | APARTMENTS_MODE | ENTRANCES_AVG | 0.611277 |
| 2644 | APARTMENTS_MEDI | ENTRANCES_AVG | 0.610779 |
| 1296 | ENTRANCES_AVG | APARTMENTS_AVG | 0.610692 |
| 3032 | ENTRANCES_MEDI | APARTMENTS_MEDI | 0.610665 |
| 2476 | LIVINGAREA_MODE | FLOORSMAX_MODE | 0.607347 |
| 3004 | ENTRANCES_MEDI | APARTMENTS_AVG | 0.607231 |
| 738 | EXT_SOURCE_1 | DAYS_BIRTH | 0.600492 |
| 2462 | LIVINGAREA_MODE | FLOORSMAX_AVG | 0.598360 |
| 3090 | FLOORSMAX_MEDI | LIVINGAREA_MODE | 0.597595 |
| 2225 | FLOORSMAX_MODE | APARTMENTS_MODE | 0.596504 |
| 3498 | TOTALAREA_MODE | ENTRANCES_AVG | 0.593474 |
| 3329 | LIVINGAREA_MEDI | ENTRANCES_MODE | 0.593359 |
| 1547 | LIVINGAPARTMENTS_AVG | FLOORSMAX_AVG | 0.589843 |
| 3255 | LIVINGAPARTMENTS_MEDI | FLOORSMAX_AVG | 0.589165 |
| 2161 | ENTRANCES_MODE | LIVINGAREA_AVG | 0.588945 |
| 3526 | TOTALAREA_MODE | ENTRANCES_MEDI | 0.587157 |
| 2658 | APARTMENTS_MEDI | ENTRANCES_MODE | 0.586964 |
| 3283 | LIVINGAPARTMENTS_MEDI | FLOORSMAX_MEDI | 0.586745 |
| 3075 | FLOORSMAX_MEDI | LIVINGAPARTMENTS_AVG | 0.586583 |
| 1791 | APARTMENTS_MODE | FLOORSMAX_AVG | 0.586504 |
| 3079 | FLOORSMAX_MEDI | APARTMENTS_MODE | 0.585628 |
| 3269 | LIVINGAPARTMENTS_MEDI | FLOORSMAX_MODE | 0.583228 |
| 2221 | FLOORSMAX_MODE | LIVINGAPARTMENTS_AVG | 0.582849 |
| 2150 | ENTRANCES_MODE | APARTMENTS_AVG | 0.582479 |
| 2400 | LIVINGAPARTMENTS_MODE | ENTRANCES_AVG | 0.573465 |
| 2415 | LIVINGAPARTMENTS_MODE | FLOORSMAX_MODE | 0.572674 |
| 3028 | ENTRANCES_MEDI | LIVINGAPARTMENTS_MODE | 0.572349 |
| 2401 | LIVINGAPARTMENTS_MODE | FLOORSMAX_AVG | 0.568441 |
| 3089 | FLOORSMAX_MEDI | LIVINGAPARTMENTS_MODE | 0.566405 |
| 3254 | LIVINGAPARTMENTS_MEDI | ENTRANCES_AVG | 0.566080 |
| 2414 | LIVINGAPARTMENTS_MODE | ENTRANCES_MODE | 0.565401 |
| 1236 | ELEVATORS_AVG | BASEMENTAREA_AVG | 0.564902 |
| 3282 | LIVINGAPARTMENTS_MEDI | ENTRANCES_MEDI | 0.564775 |
| 1546 | LIVINGAPARTMENTS_AVG | ENTRANCES_AVG | 0.563805 |
| 2972 | ELEVATORS_MEDI | BASEMENTAREA_MEDI | 0.563126 |
| 2704 | BASEMENTAREA_MEDI | ELEVATORS_AVG | 0.563083 |
| 2944 | ELEVATORS_MEDI | BASEMENTAREA_AVG | 0.562466 |
| 3014 | ENTRANCES_MEDI | LIVINGAPARTMENTS_AVG | 0.559816 |
| 3512 | TOTALAREA_MODE | ENTRANCES_MODE | 0.559791 |
| 2718 | BASEMENTAREA_MEDI | ELEVATORS_MODE | 0.556170 |
| 2090 | ELEVATORS_MODE | BASEMENTAREA_AVG | 0.555015 |
| 2104 | ELEVATORS_MODE | BASEMENTAREA_MODE | 0.554090 |
| 3496 | TOTALAREA_MODE | COMMONAREA_AVG | 0.553295 |
| 3524 | TOTALAREA_MODE | COMMONAREA_MEDI | 0.552846 |
| 3341 | LIVINGAREA_MEDI | COMMONAREA_MEDI | 0.550937 |
| 2893 | COMMONAREA_MEDI | LIVINGAREA_AVG | 0.550281 |
Conclusion :¶
Si dos variables están fuertemente correlacionadas, esto puede indicar multicolinealidad, donde las variables predicen esencialmente la misma información.
Aquí vemos, en orden, las variables más correlacionadas entre sí. Se nota que en la mayoría de los casos son variables de 'tipo de vivienda' que aportan detalles sobre el hogar, lo que tiene sentido porque son variables relacionadas entre sí y muy detalladas, que describen la vivienda con bastante precisión.
- Otras variables relacionadas son AMT_ANNUITY y AMT_CREDIT, y AMT_ANNUITY y AMT_GOODS_PRICE que van relacionando por proporcionalidad, porque AMT_ANNUITY es proporcional a AMT_CREDIT, ya que una cuota mensual más alta está típicamente asociada a un monto de crédito más alto.
Se podría, según el modelo, considerar eliminar algunas, o agruparlas en una nueva variable para reducir el número de variables y la multicolinealidad. También se puede considerar usar el PCA...
MISSING VALUES¶
# Summarize missing values of the continuous variables in the training split,
# broken down by TARGET class (see eda_utils for the exact columns returned).
eda.get_percent_null_values_target(data_train, continuous_vars, target='TARGET')
| Category_0 | variable | sum_null_values | porcentaje_sum_null_values | Category_1 | |
|---|---|---|---|---|---|
| 0 | 1.000000 | AMT_ANNUITY | 11 | 0.000045 | NaN |
| 1 | 0.919431 | AMT_GOODS_PRICE | 211 | 0.000858 | 0.080569 |
| 2 | 0.914944 | OWN_CAR_AGE | 162329 | 0.659853 | 0.085056 |
| 3 | 0.914595 | EXT_SOURCE_1 | 138528 | 0.563104 | 0.085405 |
| 4 | 0.914729 | EXT_SOURCE_2 | 516 | 0.002097 | 0.085271 |
| 5 | 0.907155 | EXT_SOURCE_3 | 48974 | 0.199075 | 0.092845 |
| 6 | 0.908756 | APARTMENTS_AVG | 124764 | 0.507154 | 0.091244 |
| 7 | 0.911016 | BASEMENTAREA_AVG | 143880 | 0.584859 | 0.088984 |
| 8 | 0.908192 | YEARS_BEGINEXPLUATATION_AVG | 119935 | 0.487525 | 0.091808 |
| 9 | 0.913332 | YEARS_BUILD_AVG | 163544 | 0.664791 | 0.086668 |
| 10 | 0.914398 | COMMONAREA_AVG | 171830 | 0.698473 | 0.085602 |
| 11 | 0.909222 | ELEVATORS_AVG | 131001 | 0.532507 | 0.090778 |
| 12 | 0.908444 | ENTRANCES_AVG | 123739 | 0.502988 | 0.091556 |
| 13 | 0.908375 | FLOORSMAX_AVG | 122325 | 0.497240 | 0.091625 |
| 14 | 0.913850 | FLOORSMIN_AVG | 166871 | 0.678315 | 0.086150 |
| 15 | 0.911949 | LANDAREA_AVG | 146029 | 0.593595 | 0.088051 |
| 16 | 0.913899 | LIVINGAPARTMENTS_AVG | 168151 | 0.683518 | 0.086101 |
| 17 | 0.908792 | LIVINGAREA_AVG | 123356 | 0.501431 | 0.091208 |
| 18 | 0.914308 | NONLIVINGAPARTMENTS_AVG | 170798 | 0.694278 | 0.085692 |
| 19 | 0.909864 | NONLIVINGAREA_AVG | 135617 | 0.551271 | 0.090136 |
| 20 | 0.908756 | APARTMENTS_MODE | 124764 | 0.507154 | 0.091244 |
| 21 | 0.911016 | BASEMENTAREA_MODE | 143880 | 0.584859 | 0.088984 |
| 22 | 0.908192 | YEARS_BEGINEXPLUATATION_MODE | 119935 | 0.487525 | 0.091808 |
| 23 | 0.913332 | YEARS_BUILD_MODE | 163544 | 0.664791 | 0.086668 |
| 24 | 0.914398 | COMMONAREA_MODE | 171830 | 0.698473 | 0.085602 |
| 25 | 0.909222 | ELEVATORS_MODE | 131001 | 0.532507 | 0.090778 |
| 26 | 0.908444 | ENTRANCES_MODE | 123739 | 0.502988 | 0.091556 |
| 27 | 0.908375 | FLOORSMAX_MODE | 122325 | 0.497240 | 0.091625 |
| 28 | 0.913850 | FLOORSMIN_MODE | 166871 | 0.678315 | 0.086150 |
| 29 | 0.911949 | LANDAREA_MODE | 146029 | 0.593595 | 0.088051 |
| 30 | 0.913899 | LIVINGAPARTMENTS_MODE | 168151 | 0.683518 | 0.086101 |
| 31 | 0.908792 | LIVINGAREA_MODE | 123356 | 0.501431 | 0.091208 |
| 32 | 0.914308 | NONLIVINGAPARTMENTS_MODE | 170798 | 0.694278 | 0.085692 |
| 33 | 0.909864 | NONLIVINGAREA_MODE | 135617 | 0.551271 | 0.090136 |
| 34 | 0.908756 | APARTMENTS_MEDI | 124764 | 0.507154 | 0.091244 |
| 35 | 0.911016 | BASEMENTAREA_MEDI | 143880 | 0.584859 | 0.088984 |
| 36 | 0.908192 | YEARS_BEGINEXPLUATATION_MEDI | 119935 | 0.487525 | 0.091808 |
| 37 | 0.913332 | YEARS_BUILD_MEDI | 163544 | 0.664791 | 0.086668 |
| 38 | 0.914398 | COMMONAREA_MEDI | 171830 | 0.698473 | 0.085602 |
| 39 | 0.909222 | ELEVATORS_MEDI | 131001 | 0.532507 | 0.090778 |
| 40 | 0.908444 | ENTRANCES_MEDI | 123739 | 0.502988 | 0.091556 |
| 41 | 0.908375 | FLOORSMAX_MEDI | 122325 | 0.497240 | 0.091625 |
| 42 | 0.913850 | FLOORSMIN_MEDI | 166871 | 0.678315 | 0.086150 |
| 43 | 0.911949 | LANDAREA_MEDI | 146029 | 0.593595 | 0.088051 |
| 44 | 0.913899 | LIVINGAPARTMENTS_MEDI | 168151 | 0.683518 | 0.086101 |
| 45 | 0.908792 | LIVINGAREA_MEDI | 123356 | 0.501431 | 0.091208 |
| 46 | 0.914308 | NONLIVINGAPARTMENTS_MEDI | 170798 | 0.694278 | 0.085692 |
| 47 | 0.909864 | NONLIVINGAREA_MEDI | 135617 | 0.551271 | 0.090136 |
| 48 | 0.907951 | TOTALAREA_MODE | 118643 | 0.482273 | 0.092049 |
| 49 | 0.896709 | AMT_REQ_CREDIT_BUREAU_MON | 33391 | 0.135731 | 0.103291 |
| 50 | 0.896709 | AMT_REQ_CREDIT_BUREAU_YEAR | 33391 | 0.135731 | 0.103291 |
Si una variable importante tiene muchos valores faltantes, puede significar que el cliente no entregó un documento necesario, lo que podría ser señal de mayor riesgo. También podría ser que esa información no era obligatoria al solicitar el préstamo. En cualquier caso, los valores faltantes pueden dar pistas sobre el comportamiento o riesgo del cliente.
¿QUÉ ESTRATEGIA DECIDÍ SEGUIR?¶
En algunos campos se debe elegir particularmente¶
Decidimos agrupar variables de vivienda, y variables mas especificas para imputar de dos maneras diferentes:
Utilizamos dos métodos para tratar los valores faltantes según las características de los datos.
La imputación con la media la aplico a las variables relacionadas con la vivienda y la descripción de los hogares, ya que estas variables son relativamente homogéneas y independientes de otros factores, lo que hace que este método sea simple y efectivo.
Por otro lado, usé el SimpleImputer con la strategy de mediana, para las columnas continuas que quedaban como las de AMT_ANNUITY, OWN_CAR_AGE y EXT_SOURCE_1...
Ya que es una técnica robusta con valores atípicos, que son mas o menos comunes en estas variables. La mediana garantiza una imputación mas rápida y razonable, evitando distorsiones debidas a valores extremos.
Además, permite un flujo consistente y escalable entre conjuntos de entrenamiento y prueba, asegurando que las mismas imputaciones se apliquen de forma uniforme. Si se necesita mayor precisión o se identifican relaciones complejas entre variables, podría implementarse un imputador de K vecinos más cercanos (KNNImputer) para predecir mejor los valores missing a partir de sus 'vecinos'.
# Work on copies so the original train/test splits remain untouched.
data_train_input = data_train.copy()
data_test_input = data_test.copy()

# Continuous housing-related columns whose nulls are replaced with the mean.
housing_continues_columns = [
    'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG',
    'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG',
    'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG',
    'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
    'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE',
    'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',
    'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI',
    'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI',
    'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI',
    'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI',
    'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE',
]

# Mean imputation on both splits.
# The fill values are computed on the TRAINING split only and reused for the
# test split: using the test split's own means would leak test-set statistics
# into preprocessing and give the two splits different fill values.
housing_train_means = data_train[housing_continues_columns].mean()
data_train_input[housing_continues_columns] = data_train[housing_continues_columns].fillna(housing_train_means)
data_test_input[housing_continues_columns] = data_test[housing_continues_columns].fillna(housing_train_means)
# Remaining continuous columns, imputed with the median.
remaining_continues_columns = [
    'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'OWN_CAR_AGE', 'EXT_SOURCE_1',
    'EXT_SOURCE_2', 'EXT_SOURCE_3', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_YEAR',
]

# Scaler applied to the columns before imputation.
scaler = StandardScaler()


def scale_and_impute(data, columns, scaler, imputer, fit=True):
    """Scale ``data[columns]`` and then impute its missing values.

    Args:
        data: DataFrame holding the columns to process.
        columns: Names of the columns to scale and impute.
        scaler: Transformer exposing ``fit_transform`` / ``transform``.
        imputer: Imputer exposing ``fit_transform`` / ``transform``.
        fit: When True (default, original behavior) both transformers are
            fitted on ``data`` before transforming it. Pass False to reuse
            transformers that were already fitted (e.g. on the training
            split) so that no statistics are learned from ``data``.

    Returns:
        The scaled and imputed values, as returned by the imputer.
    """
    values = data[columns]
    if fit:
        return imputer.fit_transform(scaler.fit_transform(values))
    return imputer.transform(scaler.transform(values))


# Median strategy for the imputer.
simple_imputer = SimpleImputer(strategy="median")

# Fit scaler + imputer on the training split only, then apply the SAME fitted
# transformers to the test split; re-fitting on test would leak test-set
# statistics and make the two splits inconsistent.
data_train_imputed = scale_and_impute(data_train_input, remaining_continues_columns, scaler, simple_imputer)
data_test_imputed = scale_and_impute(data_test_input, remaining_continues_columns, scaler, simple_imputer, fit=False)

# Rebuild DataFrames carrying the original row index and column names.
data_train_imputed_df = pd.DataFrame(data_train_imputed, columns=remaining_continues_columns, index=data_train_input.index)
data_test_imputed_df = pd.DataFrame(data_test_imputed, columns=remaining_continues_columns, index=data_test_input.index)

# Write the processed columns back into the working copies.
# NOTE: the values written back are the scaled (standardized) ones, not the
# original units, because the scaler output is what gets imputed and stored.
data_train_input[remaining_continues_columns] = data_train_imputed_df
data_test_input[remaining_continues_columns] = data_test_imputed_df
#COMPROBACION
print("\nValores nulos después de la imputación en train (remaining_continues_columns):")
print(data_train_input[remaining_continues_columns].isnull().sum())
print("\nValores nulos después de la imputación en test (remaining_continues_columns):")
print(data_test_input[remaining_continues_columns].isnull().sum())
Valores nulos después de la imputación en train (remaining_continues_columns): AMT_ANNUITY 0 AMT_GOODS_PRICE 0 OWN_CAR_AGE 0 EXT_SOURCE_1 0 EXT_SOURCE_2 0 EXT_SOURCE_3 0 AMT_REQ_CREDIT_BUREAU_MON 0 AMT_REQ_CREDIT_BUREAU_YEAR 0 dtype: int64 Valores nulos después de la imputación en test (remaining_continues_columns): AMT_ANNUITY 0 AMT_GOODS_PRICE 0 OWN_CAR_AGE 0 EXT_SOURCE_1 0 EXT_SOURCE_2 0 EXT_SOURCE_3 0 AMT_REQ_CREDIT_BUREAU_MON 0 AMT_REQ_CREDIT_BUREAU_YEAR 0 dtype: int64
Variables categoricas¶
# Count the missing values of the TARGET column in data_train and report it.
target_null_count = data_train['TARGET'].isnull().sum()
print("Cantidad de valores nulos en 'TARGET':", target_null_count)
Cantidad de valores nulos en 'TARGET': 0
# Bare expression: shows the current content of categorical_vars as the
# notebook cell output.
categorical_vars
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_QRT', 'TARGET']
# Diagnostics of the categorical columns before any imputation:
# missing values, dtypes and number of distinct values per column.
categorical_frame = data_train_input[categorical_vars]

print("missing de categoricas antes de transformar:")
missing_per_column = categorical_frame.isnull().sum()
print(missing_per_column)

print("\nTipo de cada variable:")
dtype_per_column = categorical_frame.dtypes
print(dtype_per_column)

print("\nValores unicos de cada variable:")
distinct_per_column = categorical_frame.nunique()
print(distinct_per_column)
missing de categoricas antes de transformar: NAME_CONTRACT_TYPE 0 CODE_GENDER 3 FLAG_OWN_CAR 0 FLAG_OWN_REALTY 0 CNT_CHILDREN 0 NAME_TYPE_SUITE 1003 NAME_INCOME_TYPE 0 NAME_EDUCATION_TYPE 0 NAME_FAMILY_STATUS 0 NAME_HOUSING_TYPE 0 FLAG_MOBIL 0 FLAG_EMP_PHONE 0 FLAG_WORK_PHONE 0 FLAG_CONT_MOBILE 0 FLAG_PHONE 0 FLAG_EMAIL 0 OCCUPATION_TYPE 76942 CNT_FAM_MEMBERS 2 REGION_RATING_CLIENT 0 REGION_RATING_CLIENT_W_CITY 0 WEEKDAY_APPR_PROCESS_START 0 REG_REGION_NOT_LIVE_REGION 0 REG_REGION_NOT_WORK_REGION 0 LIVE_REGION_NOT_WORK_REGION 0 REG_CITY_NOT_LIVE_CITY 0 REG_CITY_NOT_WORK_CITY 0 LIVE_CITY_NOT_WORK_CITY 0 ORGANIZATION_TYPE 0 FONDKAPREMONT_MODE 168191 HOUSETYPE_MODE 123367 WALLSMATERIAL_MODE 125005 EMERGENCYSTATE_MODE 116513 FLAG_DOCUMENT_2 0 FLAG_DOCUMENT_3 0 FLAG_DOCUMENT_4 0 FLAG_DOCUMENT_5 0 FLAG_DOCUMENT_6 0 FLAG_DOCUMENT_7 0 FLAG_DOCUMENT_8 0 FLAG_DOCUMENT_9 0 FLAG_DOCUMENT_10 0 FLAG_DOCUMENT_11 0 FLAG_DOCUMENT_12 0 FLAG_DOCUMENT_13 0 FLAG_DOCUMENT_14 0 FLAG_DOCUMENT_15 0 FLAG_DOCUMENT_16 0 FLAG_DOCUMENT_17 0 FLAG_DOCUMENT_18 0 FLAG_DOCUMENT_19 0 FLAG_DOCUMENT_20 0 FLAG_DOCUMENT_21 0 AMT_REQ_CREDIT_BUREAU_HOUR 33391 AMT_REQ_CREDIT_BUREAU_DAY 33391 AMT_REQ_CREDIT_BUREAU_WEEK 33391 AMT_REQ_CREDIT_BUREAU_QRT 33391 TARGET 0 dtype: int64 Tipo de cada variable: NAME_CONTRACT_TYPE object CODE_GENDER object FLAG_OWN_CAR object FLAG_OWN_REALTY object CNT_CHILDREN int64 NAME_TYPE_SUITE object NAME_INCOME_TYPE object NAME_EDUCATION_TYPE object NAME_FAMILY_STATUS object NAME_HOUSING_TYPE object FLAG_MOBIL int64 FLAG_EMP_PHONE int64 FLAG_WORK_PHONE int64 FLAG_CONT_MOBILE int64 FLAG_PHONE int64 FLAG_EMAIL int64 OCCUPATION_TYPE object CNT_FAM_MEMBERS float64 REGION_RATING_CLIENT int64 REGION_RATING_CLIENT_W_CITY int64 WEEKDAY_APPR_PROCESS_START object REG_REGION_NOT_LIVE_REGION int64 REG_REGION_NOT_WORK_REGION int64 LIVE_REGION_NOT_WORK_REGION int64 REG_CITY_NOT_LIVE_CITY int64 REG_CITY_NOT_WORK_CITY int64 LIVE_CITY_NOT_WORK_CITY int64 ORGANIZATION_TYPE object 
FONDKAPREMONT_MODE object HOUSETYPE_MODE object WALLSMATERIAL_MODE object EMERGENCYSTATE_MODE object FLAG_DOCUMENT_2 int64 FLAG_DOCUMENT_3 int64 FLAG_DOCUMENT_4 int64 FLAG_DOCUMENT_5 int64 FLAG_DOCUMENT_6 int64 FLAG_DOCUMENT_7 int64 FLAG_DOCUMENT_8 int64 FLAG_DOCUMENT_9 int64 FLAG_DOCUMENT_10 int64 FLAG_DOCUMENT_11 int64 FLAG_DOCUMENT_12 int64 FLAG_DOCUMENT_13 int64 FLAG_DOCUMENT_14 int64 FLAG_DOCUMENT_15 int64 FLAG_DOCUMENT_16 int64 FLAG_DOCUMENT_17 int64 FLAG_DOCUMENT_18 int64 FLAG_DOCUMENT_19 int64 FLAG_DOCUMENT_20 int64 FLAG_DOCUMENT_21 int64 AMT_REQ_CREDIT_BUREAU_HOUR float64 AMT_REQ_CREDIT_BUREAU_DAY float64 AMT_REQ_CREDIT_BUREAU_WEEK float64 AMT_REQ_CREDIT_BUREAU_QRT float64 TARGET int64 dtype: object Valores unicos de cada variable: NAME_CONTRACT_TYPE 2 CODE_GENDER 2 FLAG_OWN_CAR 2 FLAG_OWN_REALTY 2 CNT_CHILDREN 15 NAME_TYPE_SUITE 7 NAME_INCOME_TYPE 8 NAME_EDUCATION_TYPE 5 NAME_FAMILY_STATUS 6 NAME_HOUSING_TYPE 6 FLAG_MOBIL 1 FLAG_EMP_PHONE 2 FLAG_WORK_PHONE 2 FLAG_CONT_MOBILE 2 FLAG_PHONE 2 FLAG_EMAIL 2 OCCUPATION_TYPE 18 CNT_FAM_MEMBERS 17 REGION_RATING_CLIENT 3 REGION_RATING_CLIENT_W_CITY 3 WEEKDAY_APPR_PROCESS_START 7 REG_REGION_NOT_LIVE_REGION 2 REG_REGION_NOT_WORK_REGION 2 LIVE_REGION_NOT_WORK_REGION 2 REG_CITY_NOT_LIVE_CITY 2 REG_CITY_NOT_WORK_CITY 2 LIVE_CITY_NOT_WORK_CITY 2 ORGANIZATION_TYPE 58 FONDKAPREMONT_MODE 4 HOUSETYPE_MODE 3 WALLSMATERIAL_MODE 7 EMERGENCYSTATE_MODE 2 FLAG_DOCUMENT_2 2 FLAG_DOCUMENT_3 2 FLAG_DOCUMENT_4 2 FLAG_DOCUMENT_5 2 FLAG_DOCUMENT_6 2 FLAG_DOCUMENT_7 2 FLAG_DOCUMENT_8 2 FLAG_DOCUMENT_9 2 FLAG_DOCUMENT_10 2 FLAG_DOCUMENT_11 2 FLAG_DOCUMENT_12 2 FLAG_DOCUMENT_13 2 FLAG_DOCUMENT_14 2 FLAG_DOCUMENT_15 2 FLAG_DOCUMENT_16 2 FLAG_DOCUMENT_17 2 FLAG_DOCUMENT_18 2 FLAG_DOCUMENT_19 2 FLAG_DOCUMENT_20 2 FLAG_DOCUMENT_21 2 AMT_REQ_CREDIT_BUREAU_HOUR 5 AMT_REQ_CREDIT_BUREAU_DAY 9 AMT_REQ_CREDIT_BUREAU_WEEK 9 AMT_REQ_CREDIT_BUREAU_QRT 10 TARGET 2 dtype: int64
# Association of every categorical variable with TARGET, measured through
# its contingency table and Cramér's V.
for var in categorical_vars:
    # categorical_vars also lists TARGET; crossing the target with itself
    # only yields a trivial association close to 1, so it is skipped.
    if var == 'TARGET':
        continue
    print('------------------------------------------------------------')
    print(f"Confusion matrix parfa {var} con respeto a TARGET:")
    confusion_matrix = pd.crosstab(data_train['TARGET'], data_train_input[var])
    print(confusion_matrix)
    # With fewer than two observed levels on either axis the statistic is
    # undefined: computing it produced nan together with a RuntimeWarning.
    if min(confusion_matrix.shape) < 2:
        print(f"Cramér's V for {var}: no definido (la variable tiene un único valor)\n")
        continue
    # Cramér's V coefficient computed by the project helper.
    cramers_v_value = eda.cramers_v(confusion_matrix.values)
    print(f"Cramér's V for {var}: {cramers_v_value}\n")
------------------------------------------------------------ Confusion matrix parfa NAME_CONTRACT_TYPE con respeto a TARGET: NAME_CONTRACT_TYPE CASH LOANS REVOLVING LOANS TARGET 0 204106 22042 1 18590 1270 Cramér's V for NAME_CONTRACT_TYPE: 0.0310865205023682 ------------------------------------------------------------ Confusion matrix parfa CODE_GENDER con respeto a TARGET: CODE_GENDER FEMALE MALE TARGET 0 150573 75572 1 11307 8553 Cramér's V for CODE_GENDER: 0.05535981073628126 ------------------------------------------------------------ Confusion matrix parfa FLAG_OWN_CAR con respeto a TARGET: FLAG_OWN_CAR NO YES TARGET 0 148517 77631 1 13807 6053 Cramér's V for FLAG_OWN_CAR: 0.022025230924775974 ------------------------------------------------------------ Confusion matrix parfa FLAG_OWN_REALTY con respeto a TARGET: FLAG_OWN_REALTY NO YES TARGET 0 69132 157016 1 6341 13519 Cramér's V for FLAG_OWN_REALTY: 0.007754761693103105 ------------------------------------------------------------ Confusion matrix parfa CNT_CHILDREN con respeto a TARGET: CNT_CHILDREN 0 1 2 3 4 5 6 7 8 9 10 11 12 \ TARGET 0 159018 44541 19528 2674 302 60 11 5 2 0 1 0 2 1 13325 4327 1861 287 46 6 5 0 0 2 0 1 0 CNT_CHILDREN 14 19 TARGET 0 2 2 1 0 0 Cramér's V for CNT_CHILDREN: 0.023388575925143276 ------------------------------------------------------------ Confusion matrix parfa NAME_TYPE_SUITE con respeto a TARGET: NAME_TYPE_SUITE CHILDREN FAMILY GROUP OF PEOPLE OTHER_A OTHER_B \ TARGET 0 2468 29724 200 637 1266 1 193 2399 17 62 140 NAME_TYPE_SUITE SPOUSE, PARTNER UNACCOMPANIED TARGET 0 8286 182618 1 713 16282 Cramér's V for NAME_TYPE_SUITE: 0.009732772442277339 ------------------------------------------------------------ Confusion matrix parfa NAME_INCOME_TYPE con respeto a TARGET: NAME_INCOME_TYPE BUSINESSMAN COMMERCIAL ASSOCIATE MATERNITY LEAVE \ TARGET 0 10 53077 3 1 0 4324 1 NAME_INCOME_TYPE PENSIONER STATE SERVANT STUDENT UNEMPLOYED WORKING TARGET 0 41765 16381 13 13 114886 1 2367 1025 
0 5 12138 Cramér's V for NAME_INCOME_TYPE: 0.062250216935764706 ------------------------------------------------------------ Confusion matrix parfa NAME_EDUCATION_TYPE con respeto a TARGET: NAME_EDUCATION_TYPE ACADEMIC DEGREE HIGHER EDUCATION INCOMPLETE HIGHER \ TARGET 0 127 56774 7539 1 2 3192 695 NAME_EDUCATION_TYPE LOWER SECONDARY SECONDARY / SECONDARY SPECIAL TARGET 0 2665 159043 1 338 15633 Cramér's V for NAME_EDUCATION_TYPE: 0.058359656764498065 ------------------------------------------------------------ Confusion matrix parfa NAME_FAMILY_STATUS con respeto a TARGET: NAME_FAMILY_STATUS CIVIL MARRIAGE MARRIED SEPARATED SINGLE / NOT MARRIED \ TARGET 0 21364 145299 14534 32785 1 2375 11842 1329 3557 NAME_FAMILY_STATUS UNKNOWN WIDOW TARGET 0 2 12164 1 0 757 Cramér's V for NAME_FAMILY_STATUS: 0.04067121943031738 ------------------------------------------------------------ Confusion matrix parfa NAME_HOUSING_TYPE con respeto a TARGET: NAME_HOUSING_TYPE CO-OP APARTMENT HOUSE / APARTMENT MUNICIPAL APARTMENT \ TARGET 0 834 201252 8228 1 72 17008 753 NAME_HOUSING_TYPE OFFICE APARTMENT RENTED APARTMENT WITH PARENTS TARGET 0 1950 3460 10424 1 137 484 1406 Cramér's V for NAME_HOUSING_TYPE: 0.03777646043528446 ------------------------------------------------------------ Confusion matrix parfa FLAG_MOBIL con respeto a TARGET: FLAG_MOBIL 1 TARGET 0 226148 1 19860 Cramér's V for FLAG_MOBIL: nan ------------------------------------------------------------ Confusion matrix parfa FLAG_EMP_PHONE con respeto a TARGET: FLAG_EMP_PHONE 0 1 TARGET 0 41778 184370 1 2373 17487 Cramér's V for FLAG_EMP_PHONE: 0.04625807891255162 ------------------------------------------------------------ Confusion matrix parfa FLAG_WORK_PHONE con respeto a TARGET:
C:\Users\ninao\Desktop\ML_practica1EDA\notebooks\../src\eda_utils.py:549: RuntimeWarning: invalid value encountered in double_scalars return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
FLAG_WORK_PHONE 0 1 TARGET 0 181826 44322 1 15095 4765 Cramér's V for FLAG_WORK_PHONE: 0.02986670896835815 ------------------------------------------------------------ Confusion matrix parfa FLAG_CONT_MOBILE con respeto a TARGET: FLAG_CONT_MOBILE 0 1 TARGET 0 427 225721 1 36 19824 Cramér's V for FLAG_CONT_MOBILE: 0.0 ------------------------------------------------------------ Confusion matrix parfa FLAG_PHONE con respeto a TARGET: FLAG_PHONE 0 1 TARGET 0 161863 64285 1 14939 4921 Cramér's V for FLAG_PHONE: 0.021990638240084764 ------------------------------------------------------------ Confusion matrix parfa FLAG_EMAIL con respeto a TARGET: FLAG_EMAIL 0 1 TARGET 0 213165 12983 1 18773 1087 Cramér's V for FLAG_EMAIL: 0.0023645650771151965 ------------------------------------------------------------ Confusion matrix parfa OCCUPATION_TYPE con respeto a TARGET: OCCUPATION_TYPE ACCOUNTANTS CLEANING STAFF COOKING STAFF CORE STAFF \ TARGET 0 7514 3362 4276 20671 1 379 372 507 1410 OCCUPATION_TYPE DRIVERS HIGH SKILL TECH STAFF HR STAFF IT STAFF LABORERS \ TARGET 0 13195 8584 436 389 39540 1 1699 560 28 32 4652 OCCUPATION_TYPE LOW-SKILL LABORERS MANAGERS MEDICINE STAFF \ TARGET 0 1383 16048 6342 1 296 1092 454 OCCUPATION_TYPE PRIVATE SERVICE STAFF REALTY AGENTS SALES STAFF \ TARGET 0 1992 549 23201 1 136 44 2446 OCCUPATION_TYPE SECRETARIES SECURITY STAFF WAITERS/BARMEN STAFF TARGET 0 993 4751 959 1 74 585 115 Cramér's V for OCCUPATION_TYPE: 0.08084491419477519 ------------------------------------------------------------ Confusion matrix parfa CNT_FAM_MEMBERS con respeto a TARGET: CNT_FAM_MEMBERS 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 \ TARGET 0 49807 117020 38380 18067 2503 288 57 10 5 1 4573 9611 3652 1702 262 47 5 5 0 CNT_FAM_MEMBERS 10.0 11.0 12.0 13.0 14.0 15.0 16.0 20.0 TARGET 0 2 0 1 0 2 1 1 2 1 1 1 0 1 0 0 0 0 Cramér's V for CNT_FAM_MEMBERS: 0.022443597931890227 ------------------------------------------------------------ Confusion matrix parfa REGION_RATING_CLIENT 
con respeto a TARGET: REGION_RATING_CLIENT 1 2 3 TARGET 0 24430 167304 34414 1 1262 14307 4291 Cramér's V for REGION_RATING_CLIENT: 0.05797431452905184 ------------------------------------------------------------ Confusion matrix parfa REGION_RATING_CLIENT_W_CITY con respeto a TARGET: REGION_RATING_CLIENT_W_CITY 1 2 3 TARGET 0 25941 169054 31153 1 1349 14489 4022 Cramér's V for REGION_RATING_CLIENT_W_CITY: 0.060540064376819976 ------------------------------------------------------------ Confusion matrix parfa WEEKDAY_APPR_PROCESS_START con respeto a TARGET: WEEKDAY_APPR_PROCESS_START FRIDAY MONDAY SATURDAY SUNDAY THURSDAY \ TARGET 0 36887 37472 24962 11869 37218 1 3281 3152 2141 1022 3305 WEEKDAY_APPR_PROCESS_START TUESDAY WEDNESDAY TARGET 0 39534 38206 1 3580 3379 Cramér's V for WEEKDAY_APPR_PROCESS_START: 0.004495454281500501 ------------------------------------------------------------ Confusion matrix parfa REG_REGION_NOT_LIVE_REGION con respeto a TARGET: REG_REGION_NOT_LIVE_REGION 0 1 TARGET 0 222748 3400 1 19502 358 Cramér's V for REG_REGION_NOT_LIVE_REGION: 0.0062680653140641555 ------------------------------------------------------------ Confusion matrix parfa REG_REGION_NOT_WORK_REGION con respeto a TARGET: REG_REGION_NOT_WORK_REGION 0 1 TARGET 0 214743 11405 1 18745 1115 Cramér's V for REG_REGION_NOT_WORK_REGION: 0.006750820541451695 ------------------------------------------------------------ Confusion matrix parfa LIVE_REGION_NOT_WORK_REGION con respeto a TARGET: LIVE_REGION_NOT_WORK_REGION 0 1 TARGET 0 216953 9195 1 19015 845 Cramér's V for LIVE_REGION_NOT_WORK_REGION: 0.0015818927525979585 ------------------------------------------------------------ Confusion matrix parfa REG_CITY_NOT_LIVE_CITY con respeto a TARGET: REG_CITY_NOT_LIVE_CITY 0 1 TARGET 0 209170 16978 1 17481 2379 Cramér's V for REG_CITY_NOT_LIVE_CITY: 0.04516813958686674 ------------------------------------------------------------ Confusion matrix parfa REG_CITY_NOT_WORK_CITY con respeto 
a TARGET: REG_CITY_NOT_WORK_CITY 0 1 TARGET 0 175412 50736 1 13878 5982 Cramér's V for REG_CITY_NOT_WORK_CITY: 0.04965344090426648 ------------------------------------------------------------ Confusion matrix parfa LIVE_CITY_NOT_WORK_CITY con respeto a TARGET: LIVE_CITY_NOT_WORK_CITY 0 1 TARGET 0 186403 39745 1 15493 4367 Cramér's V for LIVE_CITY_NOT_WORK_CITY: 0.03126228732573633 ------------------------------------------------------------ Confusion matrix parfa ORGANIZATION_TYPE con respeto a TARGET: ORGANIZATION_TYPE ADVERTISING AGRICULTURE BANK BUSINESS ENTITY TYPE 1 \ TARGET 0 316 1761 1858 4360 1 29 207 102 394 ORGANIZATION_TYPE BUSINESS ENTITY TYPE 2 BUSINESS ENTITY TYPE 3 CLEANING \ TARGET 0 7733 49490 193 1 723 5107 22 ORGANIZATION_TYPE CONSTRUCTION CULTURE ELECTRICITY EMERGENCY GOVERNMENT \ TARGET 0 4787 291 716 418 7661 1 632 15 50 31 567 ORGANIZATION_TYPE HOTEL HOUSING INDUSTRY: TYPE 1 INDUSTRY: TYPE 10 \ TARGET 0 738 2196 731 84 1 48 185 92 3 ORGANIZATION_TYPE INDUSTRY: TYPE 11 INDUSTRY: TYPE 12 INDUSTRY: TYPE 13 \ TARGET 0 1959 291 45 1 195 12 7 ORGANIZATION_TYPE INDUSTRY: TYPE 2 INDUSTRY: TYPE 3 INDUSTRY: TYPE 4 \ TARGET 0 338 2341 631 1 21 262 74 ORGANIZATION_TYPE INDUSTRY: TYPE 5 INDUSTRY: TYPE 6 INDUSTRY: TYPE 7 \ TARGET 0 457 81 972 1 32 6 80 ORGANIZATION_TYPE INDUSTRY: TYPE 8 INDUSTRY: TYPE 9 INSURANCE \ TARGET 0 16 2535 454 1 2 185 25 ORGANIZATION_TYPE KINDERGARTEN LEGAL SERVICES MEDICINE MILITARY MOBILE \ TARGET 0 5157 229 8385 1974 228 1 383 22 590 110 22 ORGANIZATION_TYPE OTHER POLICE POSTAL REALTOR RELIGION RESTAURANT \ TARGET 0 12335 1794 1568 285 61 1295 1 1030 99 141 36 3 166 ORGANIZATION_TYPE SCHOOL SECURITY SECURITY MINISTRIES SELF-EMPLOYED \ TARGET 0 6684 2305 1494 27621 1 421 273 68 3105 ORGANIZATION_TYPE SERVICES TELECOM TRADE: TYPE 1 TRADE: TYPE 2 \ TARGET 0 1163 421 243 1419 1 80 37 27 104 ORGANIZATION_TYPE TRADE: TYPE 3 TRADE: TYPE 4 TRADE: TYPE 5 TRADE: TYPE 6 \ TARGET 0 2561 52 38 464 1 295 0 3 19 ORGANIZATION_TYPE TRADE: TYPE 
7 TRANSPORT: TYPE 1 TRANSPORT: TYPE 2 \ TARGET 0 5670 157 1619 1 583 7 140 ORGANIZATION_TYPE TRANSPORT: TYPE 3 TRANSPORT: TYPE 4 UNIVERSITY XNA TARGET 0 800 3903 1001 41769 1 154 403 59 2372 Cramér's V for ORGANIZATION_TYPE: 0.07154282732719068 ------------------------------------------------------------ Confusion matrix parfa FONDKAPREMONT_MODE con respeto a TARGET: FONDKAPREMONT_MODE NOT SPECIFIED ORG SPEC ACCOUNT REG OPER ACCOUNT \ TARGET 0 4183 4231 54982 1 348 264 4146 FONDKAPREMONT_MODE REG OPER SPEC ACCOUNT TARGET 0 9040 1 623 Cramér's V for FONDKAPREMONT_MODE: 0.012850572159249297 ------------------------------------------------------------ Confusion matrix parfa HOUSETYPE_MODE con respeto a TARGET: HOUSETYPE_MODE BLOCK OF FLATS SPECIFIC HOUSING TERRACED HOUSE TARGET 0 112116 1056 869 1 8389 125 86 Cramér's V for HOUSETYPE_MODE: 0.014947668199276329 ------------------------------------------------------------ Confusion matrix parfa WALLSMATERIAL_MODE con respeto a TARGET: WALLSMATERIAL_MODE BLOCK MIXED MONOLITHIC OTHERS PANEL STONE, BRICK \ TARGET 0 6902 1659 1355 1188 49517 48030 1 526 140 65 111 3368 3849 WALLSMATERIAL_MODE WOODEN TARGET 0 3875 1 418 Cramér's V for WALLSMATERIAL_MODE: 0.03001729551410803 ------------------------------------------------------------ Confusion matrix parfa EMERGENCYSTATE_MODE con respeto a TARGET: EMERGENCYSTATE_MODE NO YES TARGET 0 118703 1699 1 8913 180 Cramér's V for EMERGENCYSTATE_MODE: 0.01169416294243119 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_2 con respeto a TARGET: FLAG_DOCUMENT_2 0 1 TARGET 0 226139 9 1 19857 3 Cramér's V for FLAG_DOCUMENT_2: 0.0025764726142198224 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_3 con respeto a TARGET: FLAG_DOCUMENT_3 0 1 TARGET 0 66692 159456 1 4371 15489 Cramér's V for FLAG_DOCUMENT_3: 0.0449053947444949 ------------------------------------------------------------ Confusion 
matrix parfa FLAG_DOCUMENT_4 con respeto a TARGET: FLAG_DOCUMENT_4 0 1 TARGET 0 226128 20 1 19860 0 Cramér's V for FLAG_DOCUMENT_4: 0.0 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_5 con respeto a TARGET: FLAG_DOCUMENT_5 0 1 TARGET 0 222737 3411 1 19568 292 Cramér's V for FLAG_DOCUMENT_5: 0.0 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_6 con respeto a TARGET: FLAG_DOCUMENT_6 0 1 TARGET 0 205844 20304 1 18673 1187 Cramér's V for FLAG_DOCUMENT_6: 0.028860072636401262 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_7 con respeto a TARGET: FLAG_DOCUMENT_7 0 1 TARGET 0 226106 42 1 19858 2 Cramér's V for FLAG_DOCUMENT_7: 0.0 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_8 con respeto a TARGET: FLAG_DOCUMENT_8 0 1 TARGET 0 207586 18562 1 18396 1464 Cramér's V for FLAG_DOCUMENT_8: 0.008055564379763382 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_9 con respeto a TARGET: FLAG_DOCUMENT_9 0 1 TARGET 0 225254 894 1 19801 59 Cramér's V for FLAG_DOCUMENT_9: 0.003670711674578854 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_10 con respeto a TARGET: FLAG_DOCUMENT_10 0 1 TARGET 0 226141 7 1 19860 0 Cramér's V for FLAG_DOCUMENT_10: 0.0 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_11 con respeto a TARGET: FLAG_DOCUMENT_11 0 1 TARGET 0 225250 898 1 19800 60 Cramér's V for FLAG_DOCUMENT_11: 0.003494245356856063 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_12 con respeto a TARGET: FLAG_DOCUMENT_12 0 1 TARGET 0 226146 2 1 19860 0 Cramér's V for FLAG_DOCUMENT_12: 0.0 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_13 con respeto 
a TARGET: FLAG_DOCUMENT_13 0 1 TARGET 0 225280 868 1 19832 28 Cramér's V for FLAG_DOCUMENT_13: 0.010668706807718871 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_14 con respeto a TARGET: FLAG_DOCUMENT_14 0 1 TARGET 0 225463 685 1 19836 24 Cramér's V for FLAG_DOCUMENT_14: 0.008886516785392559 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_15 con respeto a TARGET: FLAG_DOCUMENT_15 0 1 TARGET 0 225858 290 1 19851 9 Cramér's V for FLAG_DOCUMENT_15: 0.0059359632723035675 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_16 con respeto a TARGET: FLAG_DOCUMENT_16 0 1 TARGET 0 223848 2300 1 19743 117 Cramér's V for FLAG_DOCUMENT_16: 0.01156868111186073 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_17 con respeto a TARGET: FLAG_DOCUMENT_17 0 1 TARGET 0 226086 62 1 19858 2 Cramér's V for FLAG_DOCUMENT_17: 0.0014222099477547469 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_18 con respeto a TARGET: FLAG_DOCUMENT_18 0 1 TARGET 0 224229 1919 1 19746 114 Cramér's V for FLAG_DOCUMENT_18: 0.007926583281735216 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_19 con respeto a TARGET: FLAG_DOCUMENT_19 0 1 TARGET 0 226002 146 1 19850 10 Cramér's V for FLAG_DOCUMENT_19: 0.0 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_20 con respeto a TARGET: FLAG_DOCUMENT_20 0 1 TARGET 0 226032 116 1 19848 12 Cramér's V for FLAG_DOCUMENT_20: 0.0 ------------------------------------------------------------ Confusion matrix parfa FLAG_DOCUMENT_21 con respeto a TARGET: FLAG_DOCUMENT_21 0 1 TARGET 0 226079 69 1 19849 11 Cramér's V for FLAG_DOCUMENT_21: 0.0026689070623304367 ------------------------------------------------------------ Confusion matrix parfa 
AMT_REQ_CREDIT_BUREAU_HOUR con respeto a TARGET: AMT_REQ_CREDIT_BUREAU_HOUR 0.0 1.0 2.0 3.0 4.0 TARGET 0 195041 1116 42 6 1 1 16304 101 6 0 0 Cramér's V for AMT_REQ_CREDIT_BUREAU_HOUR: 0.0 ------------------------------------------------------------ Confusion matrix parfa AMT_REQ_CREDIT_BUREAU_DAY con respeto a TARGET: AMT_REQ_CREDIT_BUREAU_DAY 0.0 1.0 2.0 3.0 4.0 5.0 6.0 8.0 9.0 TARGET 0 195139 919 76 35 19 8 7 1 2 1 16300 97 11 1 2 0 0 0 0 Cramér's V for AMT_REQ_CREDIT_BUREAU_DAY: 0.0034934942279904696 ------------------------------------------------------------ Confusion matrix parfa AMT_REQ_CREDIT_BUREAU_WEEK con respeto a TARGET: AMT_REQ_CREDIT_BUREAU_WEEK 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 TARGET 0 189909 6052 143 43 28 8 16 2 5 1 15877 510 16 3 3 1 1 0 0 Cramér's V for AMT_REQ_CREDIT_BUREAU_WEEK: 0.0 ------------------------------------------------------------ Confusion matrix parfa AMT_REQ_CREDIT_BUREAU_QRT con respeto a TARGET: AMT_REQ_CREDIT_BUREAU_QRT 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 \ TARGET 0 158758 25235 10519 1279 345 41 18 5 1 13538 1768 952 95 46 3 7 1 AMT_REQ_CREDIT_BUREAU_QRT 8.0 19.0 TARGET 0 6 0 1 0 1 Cramér's V for AMT_REQ_CREDIT_BUREAU_QRT: 0.020681870368418964 ------------------------------------------------------------ Confusion matrix parfa TARGET con respeto a TARGET: TARGET 0 1 TARGET 0 226148 0 1 0 19860 Cramér's V for TARGET: 0.9999726127135284
En resumen, aunque algunas variables muestran diferencias en las matrices de confusión, su capacidad para predecir el incumplimiento de pago parece limitada debido a los bajos valores del coeficiente de Cramér.
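Las variables con mayor asociación con TARGET según Cramér's V son OCCUPATION_TYPE (0.081), ORGANIZATION_TYPE (0.072), NAME_INCOME_TYPE (0.062), REGION_RATING_CLIENT_W_CITY (0.061) y NAME_EDUCATION_TYPE (0.058). Entre los indicadores binarios destacan FLAG_EMP_PHONE (0.046) y REG_CITY_NOT_LIVE_CITY (0.045), mientras que FLAG_WORK_PHONE (0.030) y REG_REGION_NOT_LIVE_REGION (0.006) aportan poco. FLAG_MOBIL toma un único valor en el conjunto de entrenamiento (Cramér's V = nan), por lo que no puede discriminar entre las clases de TARGET. Aun así, todas estas asociaciones son débiles, de modo que estas variables solo podrían servir como indicadores complementarios para la predicción de TARGET.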
El valor de Cramér's V mide la fuerza de la asociación entre dos variables categóricas y varía entre 0 y 1. Algunos de los valores más relevantes obtenidos son:
- ORGANIZATION_TYPE: 0.0715, una asociación bastante moderada (si comparamos con las otras variables) con la variable TARGET.
- Otras variables (como FLAG_DOCUMENT_2, FLAG_DOCUMENT_10, etc.) tienen valores de Cramér's V cercanos a cero, lo que indica una asociación muy débil o nula.
VALORES NULOS DE CATEGORICAS A TRATAR /¶
- Decidí rellenar con la moda (el valor más frecuente) las variables que, a primera vista, lo permiten, porque probablemente son indispensables para obtener un modelo preciso.
- Si un documento falta, probablemente significa que no ha sido entregado, así que decidí rellenar esas variables con 0.
- Y agrupar las variables para las que tiene sentido rellenar con 'sin valor' o 'desconocido'.
# Categorical columns covered by the missing-value treatment.
# The document flags FLAG_DOCUMENT_2 .. FLAG_DOCUMENT_21 are generated
# instead of being spelled out one by one.
_document_flag_columns = [f'FLAG_DOCUMENT_{number}' for number in range(2, 22)]

categorical_vars = [
    'NAME_CONTRACT_TYPE',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
    'EMERGENCYSTATE_MODE',
    'ORGANIZATION_TYPE',
    *_document_flag_columns,
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_QRT',
]
# Split of the categorical columns by imputation strategy.
# The three groups are mutually exclusive. FONDKAPREMONT_MODE and
# HOUSETYPE_MODE used to be listed both under the mode strategy and under
# the "unknown" strategy; since the mode imputation runs first, their
# "unknown" entries never had any effect and these mostly-missing columns
# were filled with the most frequent value. They are now kept only in
# fill_with_unknown.

# Columns filled with their most frequent value.
fill_with_most_frequent = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
    'ORGANIZATION_TYPE', 'OCCUPATION_TYPE'
]

# Columns whose missing values become an explicit "unknown" category.
fill_with_unknown = [
    'NAME_TYPE_SUITE', 'NAME_HOUSING_TYPE', 'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_QRT'
]

# Document flags: a missing flag is filled with 0.
fill_with_zero = [
    'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
    'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9',
    'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
    'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
    'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'
]
# --- Mode imputation: most frequent values are learned on train and
# --- reused on test.
imputer = SimpleImputer(strategy='most_frequent')
train_mode_filled = imputer.fit_transform(data_train_input[fill_with_most_frequent])
data_train_input[fill_with_most_frequent] = train_mode_filled
test_mode_filled = imputer.transform(data_test_input[fill_with_most_frequent])
data_test_input[fill_with_most_frequent] = test_mode_filled

# --- "Unknown" imputation.
unknown_label = "Desconocido"
for col in fill_with_unknown:
    if col not in data_train_input.columns:
        continue
    # Category-typed columns must know the new label before fillna can use it.
    for frame in (data_train_input, data_test_input):
        if frame[col].dtype.name == "category" and unknown_label not in frame[col].cat.categories:
            frame[col] = frame[col].cat.add_categories([unknown_label])

for frame in (data_train_input, data_test_input):
    frame[fill_with_unknown] = frame[fill_with_unknown].fillna(unknown_label)

# --- Zero imputation for the document flags.
for frame in (data_train_input, data_test_input):
    frame[fill_with_zero] = frame[fill_with_zero].fillna(0)
# Verification: remaining missing values per categorical column, first for
# the train copy and then for the test copy.
train_missing_after = data_train_input[categorical_vars].isnull().sum()
print("Valores faltantes después de la imputación en data_train_input:")
print(train_missing_after)

test_missing_after = data_test_input[categorical_vars].isnull().sum()
print("\nValores faltantes después de la imputación en data_test_input:")
print(test_missing_after)
Valores faltantes después de la imputación en data_train_input: NAME_CONTRACT_TYPE 0 CODE_GENDER 0 FLAG_OWN_CAR 0 FLAG_OWN_REALTY 0 NAME_TYPE_SUITE 0 NAME_INCOME_TYPE 0 NAME_EDUCATION_TYPE 0 NAME_FAMILY_STATUS 0 NAME_HOUSING_TYPE 0 OCCUPATION_TYPE 0 FONDKAPREMONT_MODE 0 HOUSETYPE_MODE 0 WALLSMATERIAL_MODE 0 EMERGENCYSTATE_MODE 0 ORGANIZATION_TYPE 0 FLAG_DOCUMENT_2 0 FLAG_DOCUMENT_3 0 FLAG_DOCUMENT_4 0 FLAG_DOCUMENT_5 0 FLAG_DOCUMENT_6 0 FLAG_DOCUMENT_7 0 FLAG_DOCUMENT_8 0 FLAG_DOCUMENT_9 0 FLAG_DOCUMENT_10 0 FLAG_DOCUMENT_11 0 FLAG_DOCUMENT_12 0 FLAG_DOCUMENT_13 0 FLAG_DOCUMENT_14 0 FLAG_DOCUMENT_15 0 FLAG_DOCUMENT_16 0 FLAG_DOCUMENT_17 0 FLAG_DOCUMENT_18 0 FLAG_DOCUMENT_19 0 FLAG_DOCUMENT_20 0 FLAG_DOCUMENT_21 0 AMT_REQ_CREDIT_BUREAU_HOUR 0 AMT_REQ_CREDIT_BUREAU_DAY 0 AMT_REQ_CREDIT_BUREAU_WEEK 0 AMT_REQ_CREDIT_BUREAU_QRT 0 dtype: int64 Valores faltantes después de la imputación en data_test_input: NAME_CONTRACT_TYPE 0 CODE_GENDER 0 FLAG_OWN_CAR 0 FLAG_OWN_REALTY 0 NAME_TYPE_SUITE 0 NAME_INCOME_TYPE 0 NAME_EDUCATION_TYPE 0 NAME_FAMILY_STATUS 0 NAME_HOUSING_TYPE 0 OCCUPATION_TYPE 0 FONDKAPREMONT_MODE 0 HOUSETYPE_MODE 0 WALLSMATERIAL_MODE 0 EMERGENCYSTATE_MODE 0 ORGANIZATION_TYPE 0 FLAG_DOCUMENT_2 0 FLAG_DOCUMENT_3 0 FLAG_DOCUMENT_4 0 FLAG_DOCUMENT_5 0 FLAG_DOCUMENT_6 0 FLAG_DOCUMENT_7 0 FLAG_DOCUMENT_8 0 FLAG_DOCUMENT_9 0 FLAG_DOCUMENT_10 0 FLAG_DOCUMENT_11 0 FLAG_DOCUMENT_12 0 FLAG_DOCUMENT_13 0 FLAG_DOCUMENT_14 0 FLAG_DOCUMENT_15 0 FLAG_DOCUMENT_16 0 FLAG_DOCUMENT_17 0 FLAG_DOCUMENT_18 0 FLAG_DOCUMENT_19 0 FLAG_DOCUMENT_20 0 FLAG_DOCUMENT_21 0 AMT_REQ_CREDIT_BUREAU_HOUR 0 AMT_REQ_CREDIT_BUREAU_DAY 0 AMT_REQ_CREDIT_BUREAU_WEEK 0 AMT_REQ_CREDIT_BUREAU_QRT 0 dtype: int64
Guardado de la tabla¶
# Destination files for the processed train and test tables.
train_output_path = "../data/processed_data/processed_data_input/data_train_preprocessing_missing_outlier.csv"
test_output_path = "../data/processed_data/processed_data_input/data_test_preprocessing_missing_outlier.csv"
output_targets = [
    (data_train_input, train_output_path),
    (data_test_input, test_output_path),
]

# Create the destination folders when they do not exist yet.
for _, csv_path in output_targets:
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

# Write both DataFrames as CSV files, without the index column.
for frame, csv_path in output_targets:
    frame.to_csv(csv_path, index=False)

print(f"Archivo de entrenamiento guardado en: {train_output_path}")
print(f"Archivo de prueba guardado en: {test_output_path}")
Archivo de entrenamiento guardado en: ../data/processed_data/processed_data_input/data_train_preprocessing_missing_outlier.csv Archivo de prueba guardado en: ../data/processed_data/processed_data_input/data_test_preprocessing_missing_outlier.csv
# Report the final dimensions of the processed train and test tables.
train_shape = data_train_input.shape
test_shape = data_test_input.shape
print(train_shape, test_shape)
(246008, 118) (61503, 118)